Lisa Over, Homework 2

Parts I - III

In [1]:
import warnings;
warnings.filterwarnings('ignore');
In [2]:
# Import libaries
import csv
import pandas as pd
from numpy.random import seed
from numpy.random import randn
from statsmodels.graphics.gofplots import qqplot
from matplotlib import pyplot
import matplotlib.pyplot as plt
import matplotlib
from numpy import exp
import numpy as np
import os
import math
import seaborn as sns
from sklearn import metrics
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from pprint import pprint
from sklearn.model_selection import RandomizedSearchCV

Adult_data.txt

In [3]:
# Load data from text file to dataframe
adult_df = pd.read_csv('adult_data.txt', sep = ",", header = None)
adult_df.columns = ["Age", "WorkClass", "fnlwgt", "Education", "EducationNum",
                    "MaritalStatus", "Occupation", "Relationship", "Race", "Gender",
                    "CapitalGain", "CapitalLoss", "HoursPerWeek", "NativeCountry", "Income"]
In [4]:
adult_df.head()
Out[4]:
Age WorkClass fnlwgt Education EducationNum MaritalStatus Occupation Relationship Race Gender CapitalGain CapitalLoss HoursPerWeek NativeCountry Income
0 39 State-gov 77516 Bachelors 13 Never-married Adm-clerical Not-in-family White Male 2174 0 40 United-States <=50K
1 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse Exec-managerial Husband White Male 0 0 13 United-States <=50K
2 38 Private 215646 HS-grad 9 Divorced Handlers-cleaners Not-in-family White Male 0 0 40 United-States <=50K
3 53 Private 234721 11th 7 Married-civ-spouse Handlers-cleaners Husband Black Male 0 0 40 United-States <=50K
4 28 Private 338409 Bachelors 13 Married-civ-spouse Prof-specialty Wife Black Female 0 0 40 Cuba <=50K
In [5]:
adult_df.shape
Out[5]:
(32561, 15)

Data Evaluation and Cleaning for Adult Dataset

In [6]:
#Check nulls per column
adult_df.isnull().sum(axis = 0)
Out[6]:
Age              0
WorkClass        0
fnlwgt           0
Education        0
EducationNum     0
MaritalStatus    0
Occupation       0
Relationship     0
Race             0
Gender           0
CapitalGain      0
CapitalLoss      0
HoursPerWeek     0
NativeCountry    0
Income           0
dtype: int64
In [7]:
adult_df.dtypes
Out[7]:
Age               int64
WorkClass        object
fnlwgt            int64
Education        object
EducationNum      int64
MaritalStatus    object
Occupation       object
Relationship     object
Race             object
Gender           object
CapitalGain       int64
CapitalLoss       int64
HoursPerWeek      int64
NativeCountry    object
Income           object
dtype: object
In [8]:
adult_cpy = adult_df.copy()

WorkClass

In [9]:
# List categories of WorkClass
labels = adult_cpy['WorkClass'].astype('category').cat.categories.tolist()
labels
Out[9]:
[' ?',
 ' Federal-gov',
 ' Local-gov',
 ' Never-worked',
 ' Private',
 ' Self-emp-inc',
 ' Self-emp-not-inc',
 ' State-gov',
 ' Without-pay']
In [10]:
# Strip the leading/trailing spaces from the WorkClass values
adult_cpy['WorkClass'] = adult_cpy['WorkClass'].str.strip()

# List categories of WorkClass
labels = adult_cpy['WorkClass'].astype('category').cat.categories.tolist()
labels
Out[10]:
['?',
 'Federal-gov',
 'Local-gov',
 'Never-worked',
 'Private',
 'Self-emp-inc',
 'Self-emp-not-inc',
 'State-gov',
 'Without-pay']
Never-worked category of WorkClass and the Occupation column

All rows with WorkClass = 'Never-worked' (7 total) have question marks in Occupation. Therefore, change these question marks to 'Never-worked' to create a new Occupation category.

In [11]:
# Show rows with 'Never-worked' in WorkClass column and question mark in Occupation column
nw = adult_cpy[(adult_cpy['WorkClass'] == 'Never-worked')]
nw.shape
nw
Out[11]:
Age WorkClass fnlwgt Education EducationNum MaritalStatus Occupation Relationship Race Gender CapitalGain CapitalLoss HoursPerWeek NativeCountry Income
5361 18 Never-worked 206359 10th 6 Never-married ? Own-child White Male 0 0 40 United-States <=50K
10845 23 Never-worked 188535 7th-8th 4 Divorced ? Not-in-family White Male 0 0 35 United-States <=50K
14772 17 Never-worked 237272 10th 6 Never-married ? Own-child White Male 0 0 30 United-States <=50K
20337 18 Never-worked 157131 11th 7 Never-married ? Own-child White Female 0 0 10 United-States <=50K
23232 20 Never-worked 462294 Some-college 10 Never-married ? Own-child Black Male 0 0 40 United-States <=50K
32304 30 Never-worked 176673 HS-grad 9 Married-civ-spouse ? Wife Black Female 0 0 40 United-States <=50K
32314 18 Never-worked 153663 Some-college 10 Never-married ? Own-child White Male 0 0 4 United-States <=50K
In [12]:
adult_cpy.Occupation = np.where(adult_cpy.WorkClass == 'Never-worked', 'Never-worked', adult_cpy.Occupation)
In [13]:
# Show rows with 'Never-worked' in WorkClass column
nw = adult_cpy[adult_cpy['WorkClass'] == 'Never-worked']
nw
Out[13]:
Age WorkClass fnlwgt Education EducationNum MaritalStatus Occupation Relationship Race Gender CapitalGain CapitalLoss HoursPerWeek NativeCountry Income
5361 18 Never-worked 206359 10th 6 Never-married Never-worked Own-child White Male 0 0 40 United-States <=50K
10845 23 Never-worked 188535 7th-8th 4 Divorced Never-worked Not-in-family White Male 0 0 35 United-States <=50K
14772 17 Never-worked 237272 10th 6 Never-married Never-worked Own-child White Male 0 0 30 United-States <=50K
20337 18 Never-worked 157131 11th 7 Never-married Never-worked Own-child White Female 0 0 10 United-States <=50K
23232 20 Never-worked 462294 Some-college 10 Never-married Never-worked Own-child Black Male 0 0 40 United-States <=50K
32304 30 Never-worked 176673 HS-grad 9 Married-civ-spouse Never-worked Wife Black Female 0 0 40 United-States <=50K
32314 18 Never-worked 153663 Some-college 10 Never-married Never-worked Own-child White Male 0 0 4 United-States <=50K
In [14]:
freq = pd.crosstab(adult_cpy['WorkClass'], columns='count')
freq.head(len(labels))
Out[14]:
col_0 count
WorkClass
? 1836
Federal-gov 960
Local-gov 2093
Never-worked 7
Private 22696
Self-emp-inc 1116
Self-emp-not-inc 2541
State-gov 1298
Without-pay 14
In [15]:
# Show rows with '?' in WorkCLass column
WkClass = adult_cpy[adult_cpy['WorkClass'] == '?']
WkClass.head()
Out[15]:
Age WorkClass fnlwgt Education EducationNum MaritalStatus Occupation Relationship Race Gender CapitalGain CapitalLoss HoursPerWeek NativeCountry Income
27 54 ? 180211 Some-college 10 Married-civ-spouse ? Husband Asian-Pac-Islander Male 0 0 60 South >50K
61 32 ? 293936 7th-8th 4 Married-spouse-absent ? Not-in-family White Male 0 0 40 ? <=50K
69 25 ? 200681 Some-college 10 Never-married ? Own-child White Male 0 0 40 United-States <=50K
77 67 ? 212759 10th 6 Married-civ-spouse ? Husband White Male 0 0 2 United-States <=50K
106 17 ? 304873 10th 6 Never-married ? Own-child White Female 34095 0 32 United-States <=50K

Education

In [16]:
# Strip the leading/trailing spaces from the Education values
adult_cpy['Education'] = adult_cpy['Education'].str.strip()

# List categories of Education
labels = adult_cpy['Education'].astype('category').cat.categories.tolist()
labels
Out[16]:
['10th',
 '11th',
 '12th',
 '1st-4th',
 '5th-6th',
 '7th-8th',
 '9th',
 'Assoc-acdm',
 'Assoc-voc',
 'Bachelors',
 'Doctorate',
 'HS-grad',
 'Masters',
 'Preschool',
 'Prof-school',
 'Some-college']

Marital Status

In [17]:
# Strip the leading/trailing spaces from the MaritalStatus values
adult_cpy['MaritalStatus'] = adult_cpy['MaritalStatus'].str.strip()

# List categories of MaritalStatus
labels = adult_cpy['MaritalStatus'].astype('category').cat.categories.tolist()
labels
Out[17]:
['Divorced',
 'Married-AF-spouse',
 'Married-civ-spouse',
 'Married-spouse-absent',
 'Never-married',
 'Separated',
 'Widowed']

Occupation

In [18]:
# Strip the leading/trailing spaces from the Occupation values
adult_cpy['Occupation'] = adult_cpy['Occupation'].str.strip()

# List categories of Occupation
labels = adult_cpy['Occupation'].astype('category').cat.categories.tolist()
labels
Out[18]:
['?',
 'Adm-clerical',
 'Armed-Forces',
 'Craft-repair',
 'Exec-managerial',
 'Farming-fishing',
 'Handlers-cleaners',
 'Machine-op-inspct',
 'Never-worked',
 'Other-service',
 'Priv-house-serv',
 'Prof-specialty',
 'Protective-serv',
 'Sales',
 'Tech-support',
 'Transport-moving']
In [19]:
freq = pd.crosstab(adult_cpy['Occupation'], columns='count')
freq.head(len(labels))
Out[19]:
col_0 count
Occupation
? 1836
Adm-clerical 3770
Armed-Forces 9
Craft-repair 4099
Exec-managerial 4066
Farming-fishing 994
Handlers-cleaners 1370
Machine-op-inspct 2002
Never-worked 7
Other-service 3295
Priv-house-serv 149
Prof-specialty 4140
Protective-serv 649
Sales 3650
Tech-support 928
Transport-moving 1597
In [20]:
# Show rows with '?' in Occupation column
Occ = adult_cpy[adult_cpy['Occupation'] == '?']
Occ.head()
Out[20]:
Age WorkClass fnlwgt Education EducationNum MaritalStatus Occupation Relationship Race Gender CapitalGain CapitalLoss HoursPerWeek NativeCountry Income
27 54 ? 180211 Some-college 10 Married-civ-spouse ? Husband Asian-Pac-Islander Male 0 0 60 South >50K
61 32 ? 293936 7th-8th 4 Married-spouse-absent ? Not-in-family White Male 0 0 40 ? <=50K
69 25 ? 200681 Some-college 10 Never-married ? Own-child White Male 0 0 40 United-States <=50K
77 67 ? 212759 10th 6 Married-civ-spouse ? Husband White Male 0 0 2 United-States <=50K
106 17 ? 304873 10th 6 Never-married ? Own-child White Female 34095 0 32 United-States <=50K

Relationship

In [21]:
# Strip the leading/trailing spaces from the Relationship values
adult_cpy['Relationship'] = adult_cpy['Relationship'].str.strip()

# List categories of Relationship
labels = adult_cpy['Relationship'].astype('category').cat.categories.tolist()
labels
Out[21]:
['Husband',
 'Not-in-family',
 'Other-relative',
 'Own-child',
 'Unmarried',
 'Wife']

Race

In [22]:
# Strip the leading/trailing spaces from the Race values
adult_cpy['Race'] = adult_cpy['Race'].str.strip()

# List categories of Race
labels = adult_cpy['Race'].astype('category').cat.categories.tolist()
labels
Out[22]:
['Amer-Indian-Eskimo', 'Asian-Pac-Islander', 'Black', 'Other', 'White']

Gender

In [23]:
# Strip the leading/trailing spaces from the Gender values
adult_cpy['Gender'] = adult_cpy['Gender'].str.strip()

# List categories of Gender
labels = adult_cpy['Gender'].astype('category').cat.categories.tolist()
labels
Out[23]:
['Female', 'Male']

NativeCountry

In [24]:
# Strip the leading/trailing spaces from the NativeCountry values
adult_cpy['NativeCountry'] = adult_cpy['NativeCountry'].str.strip()

# List categories of NativeCountry
labels = adult_cpy['NativeCountry'].astype('category').cat.categories.tolist()
labels
Out[24]:
['?',
 'Cambodia',
 'Canada',
 'China',
 'Columbia',
 'Cuba',
 'Dominican-Republic',
 'Ecuador',
 'El-Salvador',
 'England',
 'France',
 'Germany',
 'Greece',
 'Guatemala',
 'Haiti',
 'Holand-Netherlands',
 'Honduras',
 'Hong',
 'Hungary',
 'India',
 'Iran',
 'Ireland',
 'Italy',
 'Jamaica',
 'Japan',
 'Laos',
 'Mexico',
 'Nicaragua',
 'Outlying-US(Guam-USVI-etc)',
 'Peru',
 'Philippines',
 'Poland',
 'Portugal',
 'Puerto-Rico',
 'Scotland',
 'South',
 'Taiwan',
 'Thailand',
 'Trinadad&Tobago',
 'United-States',
 'Vietnam',
 'Yugoslavia']
In [25]:
freq = pd.crosstab(adult_cpy['NativeCountry'], columns='count')
freq.head(len(labels))
Out[25]:
col_0 count
NativeCountry
? 583
Cambodia 19
Canada 121
China 75
Columbia 59
Cuba 95
Dominican-Republic 70
Ecuador 28
El-Salvador 106
England 90
France 29
Germany 137
Greece 29
Guatemala 64
Haiti 44
Holand-Netherlands 1
Honduras 13
Hong 20
Hungary 13
India 100
Iran 43
Ireland 24
Italy 73
Jamaica 81
Japan 62
Laos 18
Mexico 643
Nicaragua 34
Outlying-US(Guam-USVI-etc) 14
Peru 31
Philippines 198
Poland 60
Portugal 37
Puerto-Rico 114
Scotland 12
South 80
Taiwan 51
Thailand 18
Trinadad&Tobago 19
United-States 29170
Vietnam 67
Yugoslavia 16

Income

In [26]:
# Strip the leading/trailing spaces from the NativeCountry values
adult_cpy['Income'] = adult_cpy['Income'].str.strip()

# List categories of NativeCountry
labels = adult_cpy['Income'].astype('category').cat.categories.tolist()
labels
Out[26]:
['<=50K', '>50K']
In [27]:
# Show rows with '?' in NativeCountry column
#NatCnty = adult_cpy[adult_cpy['NativeCountry'] == '?']
#NatCnty

Compare full adult_cpy dataset (with '?' values) to a new dataset with '?' removed (no rows with '?')

For each variable, compare proportion of data in each of it's categories between the two datasets. If the proportion does not change between the full and partial datasets, then there is no pattern to the rows that contain '?' -- then delete the rows with '?' to create adult_known dataset.

In [28]:
# Remove rows with '?'
adult_known = adult_cpy[(adult_cpy['WorkClass'] != '?') & (adult_cpy['Occupation'] != '?') & 
                       (adult_cpy['NativeCountry'] != '?')]
adult_known.shape
Out[28]:
(30169, 15)

WorkClass categories have similar proportions between the full and partial datasets

In [29]:
labels = adult_known['WorkClass'].astype('category').cat.categories.tolist()
freq = pd.crosstab(adult_known['WorkClass'], columns='proportion').apply(lambda r: r/len(adult_known), axis=1)
freq.head(len(labels))
Out[29]:
col_0 proportion
WorkClass
Federal-gov 0.031257
Local-gov 0.068514
Never-worked 0.000232
Private 0.738705
Self-emp-inc 0.035599
Self-emp-not-inc 0.082833
State-gov 0.042395
Without-pay 0.000464
In [30]:
labels = adult_cpy['WorkClass'].astype('category').cat.categories.tolist()
freq = pd.crosstab(adult_cpy['WorkClass'], columns='proportion').apply(lambda r: r/len(adult_cpy), axis=1)
freq.head(len(labels))
Out[30]:
col_0 proportion
WorkClass
? 0.056386
Federal-gov 0.029483
Local-gov 0.064279
Never-worked 0.000215
Private 0.697030
Self-emp-inc 0.034274
Self-emp-not-inc 0.078038
State-gov 0.039864
Without-pay 0.000430

Education categories have similar proportions between the full and partial datasets

In [31]:
labels = adult_known['Education'].astype('category').cat.categories.tolist()
freq = pd.crosstab(adult_known['Education'], columns='proportion').apply(lambda r: r/len(adult_known), axis=1)
freq.head(len(labels))
Out[31]:
col_0 proportion
Education
10th 0.027247
11th 0.034771
12th 0.012496
1st-4th 0.005005
5th-6th 0.009546
7th-8th 0.018496
9th 0.015082
Assoc-acdm 0.033412
Assoc-voc 0.043323
Bachelors 0.167191
Doctorate 0.012430
HS-grad 0.326196
Masters 0.053930
Preschool 0.001492
Prof-school 0.017965
Some-college 0.221419
In [32]:
labels = adult_cpy['Education'].astype('category').cat.categories.tolist()
freq = pd.crosstab(adult_cpy['Education'], columns='proportion').apply(lambda r: r/len(adult_cpy), axis=1)
freq.head(len(labels))
Out[32]:
col_0 proportion
Education
10th 0.028654
11th 0.036086
12th 0.013298
1st-4th 0.005160
5th-6th 0.010227
7th-8th 0.019840
9th 0.015786
Assoc-acdm 0.032769
Assoc-voc 0.042443
Bachelors 0.164461
Doctorate 0.012684
HS-grad 0.322502
Masters 0.052916
Preschool 0.001566
Prof-school 0.017690
Some-college 0.223918

MaritalStatus categories have similar proportions between the full and partial datasets

In [33]:
labels = adult_known['MaritalStatus'].astype('category').cat.categories.tolist()
freq = pd.crosstab(adult_known['MaritalStatus'], columns='proportion').apply(lambda r: r/len(adult_known), axis=1)
freq.head(len(labels))
Out[33]:
col_0 proportion
MaritalStatus
Divorced 0.139713
Married-AF-spouse 0.000696
Married-civ-spouse 0.466240
Married-spouse-absent 0.012264
Never-married 0.322550
Separated 0.031125
Widowed 0.027412
In [34]:
labels = adult_cpy['MaritalStatus'].astype('category').cat.categories.tolist()
freq = pd.crosstab(adult_cpy['MaritalStatus'], columns='proportion').apply(lambda r: r/len(adult_cpy), axis=1)
freq.head(len(labels))
Out[34]:
col_0 proportion
MaritalStatus
Divorced 0.136452
Married-AF-spouse 0.000706
Married-civ-spouse 0.459937
Married-spouse-absent 0.012837
Never-married 0.328092
Separated 0.031479
Widowed 0.030497

Occupation categories have similar proportions between the full and partial datasets

In [35]:
labels = adult_known['Occupation'].astype('category').cat.categories.tolist()
freq = pd.crosstab(adult_known['Occupation'], columns='proportion').apply(lambda r: r/len(adult_known), axis=1)
freq.head(len(labels))
Out[35]:
col_0 proportion
Occupation
Adm-clerical 0.123339
Armed-Forces 0.000298
Craft-repair 0.133581
Exec-managerial 0.132321
Farming-fishing 0.032782
Handlers-cleaners 0.044748
Machine-op-inspct 0.065166
Never-worked 0.000232
Other-service 0.106467
Priv-house-serv 0.004740
Prof-specialty 0.133846
Protective-serv 0.021346
Sales 0.118797
Tech-support 0.030230
Transport-moving 0.052106
In [36]:
labels = adult_cpy['Occupation'].astype('category').cat.categories.tolist()
freq = pd.crosstab(adult_cpy['Occupation'], columns='proportion').apply(lambda r: r/len(adult_cpy), axis=1)
freq.head(len(labels))
Out[36]:
col_0 proportion
Occupation
? 0.056386
Adm-clerical 0.115783
Armed-Forces 0.000276
Craft-repair 0.125887
Exec-managerial 0.124873
Farming-fishing 0.030527
Handlers-cleaners 0.042075
Machine-op-inspct 0.061485
Never-worked 0.000215
Other-service 0.101195
Priv-house-serv 0.004576
Prof-specialty 0.127146
Protective-serv 0.019932
Sales 0.112097
Tech-support 0.028500
Transport-moving 0.049046

Relationship categories have similar proportions between the full and partial datasets

In [37]:
labels = adult_known['Relationship'].astype('category').cat.categories.tolist()
freq = pd.crosstab(adult_known['Relationship'], columns='proportion').apply(lambda r: r/len(adult_known), axis=1)
freq.head(len(labels))
Out[37]:
col_0 proportion
Relationship
Husband 0.413106
Not-in-family 0.256124
Other-relative 0.029467
Own-child 0.148198
Unmarried 0.106467
Wife 0.046637
In [38]:
labels = adult_cpy['Relationship'].astype('category').cat.categories.tolist()
freq = pd.crosstab(adult_cpy['Relationship'], columns='proportion').apply(lambda r: r/len(adult_cpy), axis=1)
freq.head(len(labels))
Out[38]:
col_0 proportion
Relationship
Husband 0.405178
Not-in-family 0.255060
Other-relative 0.030128
Own-child 0.155646
Unmarried 0.105832
Wife 0.048156

Race categories have similar proportions between the full and partial datasets

In [39]:
labels = adult_known['Race'].astype('category').cat.categories.tolist()
freq = pd.crosstab(adult_known['Race'], columns='proportion').apply(lambda r: r/len(adult_known), axis=1)
freq.head(len(labels))
Out[39]:
col_0 proportion
Race
Amer-Indian-Eskimo 0.009480
Asian-Pac-Islander 0.029666
Black 0.093440
Other 0.007657
White 0.859757
In [40]:
labels = adult_cpy['Race'].astype('category').cat.categories.tolist()
freq = pd.crosstab(adult_cpy['Race'], columns='proportion').apply(lambda r: r/len(adult_cpy), axis=1)
freq.head(len(labels))
Out[40]:
col_0 proportion
Race
Amer-Indian-Eskimo 0.009551
Asian-Pac-Islander 0.031909
Black 0.095943
Other 0.008323
White 0.854274

Gender categories have similar proportions between the full and partial datasets

In [41]:
labels = adult_known['Gender'].astype('category').cat.categories.tolist()
freq = pd.crosstab(adult_known['Gender'], columns='proportion').apply(lambda r: r/len(adult_known), axis=1)
freq.head(len(labels))
Out[41]:
col_0 proportion
Gender
Female 0.324306
Male 0.675694
In [42]:
labels = adult_cpy['Gender'].astype('category').cat.categories.tolist()
freq = pd.crosstab(adult_cpy['Gender'], columns='proportion').apply(lambda r: r/len(adult_cpy), axis=1)
freq.head(len(labels))
Out[42]:
col_0 proportion
Gender
Female 0.330795
Male 0.669205
In [43]:
labels = adult_known['Income'].astype('category').cat.categories.tolist()
freq = pd.crosstab(adult_known['Income'], columns='proportion').apply(lambda r: r/len(adult_known), axis=1)
freq.head(len(labels))
Out[43]:
col_0 proportion
Income
<=50K 0.751135
>50K 0.248865
In [44]:
labels = adult_cpy['Income'].astype('category').cat.categories.tolist()
freq = pd.crosstab(adult_cpy['Income'], columns='proportion').apply(lambda r: r/len(adult_cpy), axis=1)
freq.head(len(labels))
Out[44]:
col_0 proportion
Income
<=50K 0.75919
>50K 0.24081
In [45]:
adult_known['WorkClass_c'] = adult_known['WorkClass']
adult_known.head()
Out[45]:
Age WorkClass fnlwgt Education EducationNum MaritalStatus Occupation Relationship Race Gender CapitalGain CapitalLoss HoursPerWeek NativeCountry Income WorkClass_c
0 39 State-gov 77516 Bachelors 13 Never-married Adm-clerical Not-in-family White Male 2174 0 40 United-States <=50K State-gov
1 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse Exec-managerial Husband White Male 0 0 13 United-States <=50K Self-emp-not-inc
2 38 Private 215646 HS-grad 9 Divorced Handlers-cleaners Not-in-family White Male 0 0 40 United-States <=50K Private
3 53 Private 234721 11th 7 Married-civ-spouse Handlers-cleaners Husband Black Male 0 0 40 United-States <=50K Private
4 28 Private 338409 Bachelors 13 Married-civ-spouse Prof-specialty Wife Black Female 0 0 40 Cuba <=50K Private

Rows with question marks in WorkClass, Occupation, or NativeCountry

The distributions of variables did not change significantly when the rows with question marks were removed. Therefore, use the dataset with all known values to fit models.

Create coded variable for WorkClass and dummy variables for other categorical variables

In [46]:
adult_known.min()
Out[46]:
Age                              17
WorkClass               Federal-gov
fnlwgt                        13769
Education                      10th
EducationNum                      1
MaritalStatus              Divorced
Occupation             Adm-clerical
Relationship                Husband
Race             Amer-Indian-Eskimo
Gender                       Female
CapitalGain                       0
CapitalLoss                       0
HoursPerWeek                      1
NativeCountry              Cambodia
Income                        <=50K
WorkClass_c             Federal-gov
dtype: object
In [47]:
adult_known.max()
Out[47]:
Age                            90
WorkClass             Without-pay
fnlwgt                    1484705
Education            Some-college
EducationNum                   16
MaritalStatus             Widowed
Occupation       Transport-moving
Relationship                 Wife
Race                        White
Gender                       Male
CapitalGain                 99999
CapitalLoss                  4356
HoursPerWeek                   99
NativeCountry          Yugoslavia
Income                       >50K
WorkClass_c           Without-pay
dtype: object
In [48]:
# Create a numeric variable for WorkClass
labels = adult_known['WorkClass_c'].astype('category').cat.categories.tolist()
replace_map_comp = {'WorkClass_c' : {k: v for k,v in zip(labels,list(range(1,len(labels)+1)))}}
adult_known.replace(replace_map_comp, inplace=True)

# Create dummy variables of categorical variables
adult_known = pd.get_dummies(adult_known, columns=['WorkClass'])
adult_known = pd.get_dummies(adult_known, columns=['Education'])
adult_known = pd.get_dummies(adult_known, columns=['MaritalStatus'])
adult_known = pd.get_dummies(adult_known, columns=['Occupation'])
adult_known = pd.get_dummies(adult_known, columns=['Relationship'])
adult_known = pd.get_dummies(adult_known, columns=['Race'])
adult_known = pd.get_dummies(adult_known, columns=['Gender'])
adult_known = pd.get_dummies(adult_known, columns=['NativeCountry'])
adult_known = pd.get_dummies(adult_known, columns=['Income'])

adult_known['age_bins'] = pd.cut(x=adult_known['Age'], bins=[10, 19, 29, 39, 49, 59, 69, 79, 89, 99])
adult_known = pd.get_dummies(adult_known, columns=['age_bins'])

adult_known.head()
Out[48]:
Age fnlwgt EducationNum CapitalGain CapitalLoss HoursPerWeek WorkClass_c WorkClass_Federal-gov WorkClass_Local-gov WorkClass_Never-worked ... Income_>50K age_bins_(10, 19] age_bins_(19, 29] age_bins_(29, 39] age_bins_(39, 49] age_bins_(49, 59] age_bins_(59, 69] age_bins_(69, 79] age_bins_(79, 89] age_bins_(89, 99]
0 39 77516 13 2174 0 40 7 0 0 0 ... 0 0 0 1 0 0 0 0 0 0
1 50 83311 13 0 0 13 6 0 0 0 ... 0 0 0 0 0 1 0 0 0 0
2 38 215646 9 0 0 40 4 0 0 0 ... 0 0 0 1 0 0 0 0 0 0
3 53 234721 7 0 0 40 4 0 0 0 ... 0 0 0 0 0 1 0 0 0 0
4 28 338409 13 0 0 40 4 0 0 0 ... 0 0 1 0 0 0 0 0 0 0

5 rows × 118 columns

Check normality assumption for continuous variables

fnlwgt

In [49]:
# Box-plot of 'fnlwgt' column - no transformation
sns.boxplot(x = adult_known['fnlwgt'])
Out[49]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a23a3c110>
In [50]:
# Box-plot of 'fnlwgt' column - transformed to resolve non-normality
sns.boxplot(x = adult_known['fnlwgt']**0.1)
Out[50]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a23a85e50>
In [51]:
# q-q plot for the 'fnlwgt' column - no transformation
qqplot(adult_known['fnlwgt'], line='s')
pyplot.show()
In [52]:
# q-q plot for the 'fnlwgt' column - transformed to resolve non-normality
adult_known['fnlwgt_t'] = adult_known['fnlwgt']**0.1
qqplot(adult_known['fnlwgt_t'], line='s')
pyplot.show()
In [53]:
adult_known.head()
Out[53]:
Age fnlwgt EducationNum CapitalGain CapitalLoss HoursPerWeek WorkClass_c WorkClass_Federal-gov WorkClass_Local-gov WorkClass_Never-worked ... age_bins_(10, 19] age_bins_(19, 29] age_bins_(29, 39] age_bins_(39, 49] age_bins_(49, 59] age_bins_(59, 69] age_bins_(69, 79] age_bins_(79, 89] age_bins_(89, 99] fnlwgt_t
0 39 77516 13 2174 0 40 7 0 0 0 ... 0 0 1 0 0 0 0 0 0 3.082756
1 50 83311 13 0 0 13 6 0 0 0 ... 0 0 0 0 1 0 0 0 0 3.105062
2 38 215646 9 0 0 40 4 0 0 0 ... 0 0 1 0 0 0 0 0 0 3.414870
3 53 234721 7 0 0 40 4 0 0 0 ... 0 0 0 0 1 0 0 0 0 3.443937
4 28 338409 13 0 0 40 4 0 0 0 ... 0 1 0 0 0 0 0 0 0 3.572269

5 rows × 119 columns

CapitalGain

In [54]:
# q-q plot for the 'CapitalGain' column - too many zero values
qqplot(adult_known['CapitalGain'], line='s')
pyplot.show()
In [55]:
# q-q plot for the 'CapitalGain' column
# Too many zero values to be transformed to resolve non-normality even with log(x+1)
adult_known['CapitalGain_t'] = np.log(adult_known['CapitalGain']+1)
qqplot(adult_known['CapitalGain_t'], line='s')
pyplot.show()

CapitalLoss

In [56]:
# q-q plot for the 'CapitalLoss' column - too many zero values to be transformed to resolve non-normality
qqplot(adult_known['CapitalLoss'], line='s')
pyplot.show()
In [57]:
# q-q plot for the 'CapitalLoss' column
# Too many zero values to be transformed to resolve non-normality even with log(x+1)
adult_known['CapitalLoss_t'] = np.log(adult_known['CapitalLoss']+1)
qqplot(adult_known['CapitalLoss_t'], line='s')
pyplot.show()
In [58]:
# q-q plot for difference between CapitalGain and CapitalLoss - transformed for normality
adult_known['CapitalDiff_t'] = (adult_known['CapitalGain'] - adult_known['CapitalLoss'] + 1)**10
qqplot(adult_known['CapitalDiff_t'], line='s')
pyplot.show()

HoursPerWeek

In [59]:
# Box-plot of 'HoursPerWeek' column - no transformation - transformed to resolve non-normality
sns.boxplot(x = adult_known['HoursPerWeek']**0.75)
Out[59]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a23fed3d0>
In [60]:
# q-q plot for the 'HoursPerWeek' column - transformed to resolve non-normality
adult_known['HoursPerWeek_t'] = adult_known['HoursPerWeek']**0.75
qqplot(adult_known['HoursPerWeek_t']**0.75, line='s')
pyplot.show()
In [61]:
adult_known.head()
Out[61]:
Age fnlwgt EducationNum CapitalGain CapitalLoss HoursPerWeek WorkClass_c WorkClass_Federal-gov WorkClass_Local-gov WorkClass_Never-worked ... age_bins_(49, 59] age_bins_(59, 69] age_bins_(69, 79] age_bins_(79, 89] age_bins_(89, 99] fnlwgt_t CapitalGain_t CapitalLoss_t CapitalDiff_t HoursPerWeek_t
0 39 77516 13 2174 0 40 7 0 0 0 ... 0 0 0 0 0 3.082756 7.684784 0.0 7852521921562471169 15.905415
1 50 83311 13 0 0 13 6 0 0 0 ... 1 0 0 0 0 3.105062 0.000000 0.0 1 6.846325
2 38 215646 9 0 0 40 4 0 0 0 ... 0 0 0 0 0 3.414870 0.000000 0.0 1 15.905415
3 53 234721 7 0 0 40 4 0 0 0 ... 1 0 0 0 0 3.443937 0.000000 0.0 1 15.905415
4 28 338409 13 0 0 40 4 0 0 0 ... 0 0 0 0 0 3.572269 0.000000 0.0 1 15.905415

5 rows × 123 columns

In [62]:
# Box-plot of 'Age' column - no transformation
sns.boxplot(x = adult_known['Age'])
Out[62]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a274942d0>
In [63]:
# Box-plot of 'Age' column - transformed to resolve non-normality
sns.boxplot(x = np.log(adult_known['Age']))
Out[63]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a2770ad90>
In [64]:
# q-q plot for the 'Age' column - no transformation
qqplot(adult_known['Age'], line='s')
pyplot.show()
In [65]:
# q-q plot for the 'Age' column - transformed but this graph does not show resolved non-normality
adult_known['Age_t'] = np.log(adult_known['Age'])
qqplot(adult_known['Age_t']**0.1, line='s')
pyplot.show()
In [66]:
adult_known.head()
Out[66]:
Age fnlwgt EducationNum CapitalGain CapitalLoss HoursPerWeek WorkClass_c WorkClass_Federal-gov WorkClass_Local-gov WorkClass_Never-worked ... age_bins_(59, 69] age_bins_(69, 79] age_bins_(79, 89] age_bins_(89, 99] fnlwgt_t CapitalGain_t CapitalLoss_t CapitalDiff_t HoursPerWeek_t Age_t
0 39 77516 13 2174 0 40 7 0 0 0 ... 0 0 0 0 3.082756 7.684784 0.0 7852521921562471169 15.905415 3.663562
1 50 83311 13 0 0 13 6 0 0 0 ... 0 0 0 0 3.105062 0.000000 0.0 1 6.846325 3.912023
2 38 215646 9 0 0 40 4 0 0 0 ... 0 0 0 0 3.414870 0.000000 0.0 1 15.905415 3.637586
3 53 234721 7 0 0 40 4 0 0 0 ... 0 0 0 0 3.443937 0.000000 0.0 1 15.905415 3.970292
4 28 338409 13 0 0 40 4 0 0 0 ... 0 0 0 0 3.572269 0.000000 0.0 1 15.905415 3.332205

5 rows × 124 columns

Question 1: Predict the Working Class with Naive Bayes

Gaussian Naive Bayes Model

The CaptialGain and CapitalLoss variables are not included in the Gaussian model below because they do not meet the assumption of normality. Even with the log(x+1) transformation, these variables did not become more normally distributed. Even with the assumption violation, these variables, along with a transformation of the difference between these two values, were tried in the model but none of them yielded any improvement.

The multinomial Gaussian naive Bayes model with 'Age_t', 'fnlwgt_t', and 'HoursPerWeek_t' has a high accuracy for both train and test, 0.7369 and 0.7236, respectively. However, this model only predicts classes 3, 4, and 6. This model was aborted in favor of the multinomial Bernoulli naive Bayes. See below for that model and the answers to the homework questions.

In [67]:
# Obtain distribution of WorkClass_c for use as priors in models
labels = adult_known['WorkClass_c'].astype('category').cat.categories.tolist()
freq = pd.crosstab(adult_known['WorkClass_c'], columns='proportion').apply(lambda r: r/len(adult_known), axis=1)
freq.head(len(labels))
Out[67]:
col_0 proportion
WorkClass_c
1 0.031257
2 0.068514
3 0.000232
4 0.738705
5 0.035599
6 0.082833
7 0.042395
8 0.000464
In [68]:
# Create a Gaussian Naive Bayes model
gauss_xvars = adult_known[['Age_t', 'fnlwgt_t', 'HoursPerWeek_t']]
target = adult_known[['WorkClass_c']]
In [69]:
# Scale and split into train and test sets
x_train, x_test, y_train, y_test = train_test_split(preprocessing.scale(gauss_xvars), 
                                                    target, 
                                                    test_size=0.2, 
                                                    random_state=1)
In [70]:
mnbg_clf = GaussianNB(
    #class_prior = [0.0313, 0.0685, 0.0002, 0.7387, 0.0356, 0.0828, 0.0424, 0.0005]
                              )
In [71]:
mnb_gmodel = mnbg_clf.fit(x_train, y_train)
In [72]:
y_pred_train = y_train.copy()
y_pred_train['CLASS_PREDS'] = mnb_gmodel.predict(x_train)
In [73]:
pd.crosstab(y_pred_train['WorkClass_c'], y_pred_train['CLASS_PREDS'],
            rownames=['True'], colnames=['Predicted'], margins=True)
Out[73]:
Predicted 3 4 6 All
True
1 0 736 10 746
2 0 1595 30 1625
3 0 6 0 6
4 4 17609 249 17862
5 0 808 39 847
6 1 1817 176 1994
7 0 1031 14 1045
8 0 10 0 10
All 5 23612 518 24135
In [74]:
# Predict probabilities with the multinomial model
train_df = pd.DataFrame(mnb_gmodel.predict_proba(x_train))
train_df.columns = ['prob' + str(col) for col in train_df.columns]
train_df.head()
Out[74]:
prob0 prob1 prob2 prob3 prob4 prob5 prob6 prob7
0 0.024965 0.064361 4.641681e-05 0.811001 0.024343 0.041992 0.033156 0.000134
1 0.028716 0.056804 9.771999e-05 0.840836 0.009995 0.026740 0.036635 0.000175
2 0.023423 0.063741 4.618472e-05 0.806570 0.026826 0.046238 0.033016 0.000140
3 0.003151 0.021614 5.120310e-03 0.920733 0.002087 0.013613 0.033107 0.000575
4 0.041562 0.087700 2.091052e-07 0.688434 0.056708 0.086109 0.039332 0.000156
In [75]:
y_pred_test = y_test
y_pred_test['CLASS_PREDS'] = mnb_gmodel.predict(x_test)
In [76]:
pd.crosstab(y_pred_test['WorkClass_c'], y_pred_test['CLASS_PREDS'],
            rownames=['True'], colnames=['Predicted'], margins=True)
Out[76]:
Predicted 3 4 6 All
True
1 0 192 5 197
2 0 429 13 442
3 0 1 0 1
4 1 4335 88 4424
5 0 212 15 227
6 0 474 31 505
7 0 230 4 234
8 0 4 0 4
All 1 5877 156 6034
In [77]:
# Predict probabilities with the multinomial model
test_df = pd.DataFrame(mnb_gmodel.predict_proba(x_test))
test_df.columns = ['prob' + str(col) for col in test_df.columns]
test_df.head()
Out[77]:
prob0 prob1 prob2 prob3 prob4 prob5 prob6 prob7
0 0.030156 0.066431 4.351129e-05 0.818368 0.015644 0.033149 0.036066 0.000142
1 0.012796 0.044044 5.801589e-04 0.874783 0.006219 0.023658 0.037662 0.000258
2 0.022166 0.079540 2.297783e-08 0.586430 0.115180 0.163014 0.033463 0.000206
3 0.000159 0.007502 7.578714e-02 0.846854 0.000797 0.025477 0.038261 0.005163
4 0.025551 0.060549 6.829524e-05 0.833097 0.014289 0.030426 0.035873 0.000149
In [78]:
accuracy_score(y_pred_train['WorkClass_c'],  y_pred_train['CLASS_PREDS'])
Out[78]:
0.7368966231613839
In [79]:
accuracy_score(y_pred_test['WorkClass_c'],  y_pred_test['CLASS_PREDS'])
Out[79]:
0.723566456745111
In [80]:
0.7361 - 0.7231
Out[80]:
0.013000000000000012

Bernoulli Naive Bayes Model

The accuracy scores for the Bernoulli model, 0.6341 and 0.6283, for train and test, respectively. This model predicts classes 1, 2, 4, 5, 6, and 7. A model with Age binned and converted to dummy variables resulted in a slight decrease in the training and test accurracy, 0.6253 and 0.6155, respectively. This model with age predicted classes 1, 2, 4, 5, 6, 7, and 8. Because of the slightly better accuracy and adequate number of classes being predicted, the model chosen is the model without Age.

In [81]:
#list(adult_known.columns.values.tolist())
In [82]:
# Create a Bernoulli Naive Bayes model
target = adult_known[['WorkClass_c']]
bern_xvars = adult_known[['Education_10th',
 'Education_11th',
 'Education_12th',
 'Education_1st-4th',
 'Education_5th-6th',
 'Education_7th-8th',
 'Education_9th',
 'Education_Assoc-acdm',
 'Education_Assoc-voc',
 'Education_Bachelors',
 'Education_Doctorate',
 'Education_HS-grad',
 'Education_Masters',
 'Education_Preschool',
 'Education_Prof-school',
 'Education_Some-college',
 'MaritalStatus_Divorced',
 'MaritalStatus_Married-AF-spouse',
 'MaritalStatus_Married-civ-spouse',
 'MaritalStatus_Married-spouse-absent',
 'MaritalStatus_Never-married',
 'MaritalStatus_Separated',
 'MaritalStatus_Widowed',
 'Occupation_Adm-clerical',
 'Occupation_Armed-Forces',
 'Occupation_Craft-repair',
 'Occupation_Exec-managerial',
 'Occupation_Farming-fishing',
 'Occupation_Handlers-cleaners',
 'Occupation_Machine-op-inspct',
 'Occupation_Never-worked',
 'Occupation_Other-service',
 'Occupation_Priv-house-serv',
 'Occupation_Prof-specialty',
 'Occupation_Protective-serv',
 'Occupation_Sales',
 'Occupation_Tech-support',
 'Occupation_Transport-moving',
 'Relationship_Husband',
 'Relationship_Not-in-family',
 'Relationship_Other-relative',
 'Relationship_Own-child',
 'Relationship_Unmarried',
 'Relationship_Wife',
 'Race_Amer-Indian-Eskimo',
 'Race_Asian-Pac-Islander',
 'Race_Black',
 'Race_Other',
 'Race_White',
 'Gender_Female',
 'Gender_Male',
 'NativeCountry_Cambodia',
 'NativeCountry_Canada',
 'NativeCountry_China',
 'NativeCountry_Columbia',
 'NativeCountry_Cuba',
 'NativeCountry_Dominican-Republic',
 'NativeCountry_Ecuador',
 'NativeCountry_El-Salvador',
 'NativeCountry_England',
 'NativeCountry_France',
 'NativeCountry_Germany',
 'NativeCountry_Greece',
 'NativeCountry_Guatemala',
 'NativeCountry_Haiti',
 'NativeCountry_Holand-Netherlands',
 'NativeCountry_Honduras',
 'NativeCountry_Hong',
 'NativeCountry_Hungary',
 'NativeCountry_India',
 'NativeCountry_Iran',
 'NativeCountry_Ireland',
 'NativeCountry_Italy',
 'NativeCountry_Jamaica',
 'NativeCountry_Japan',
 'NativeCountry_Laos',
 'NativeCountry_Mexico',
 'NativeCountry_Nicaragua',
 'NativeCountry_Outlying-US(Guam-USVI-etc)',
 'NativeCountry_Peru',
 'NativeCountry_Philippines',
 'NativeCountry_Poland',
 'NativeCountry_Portugal',
 'NativeCountry_Puerto-Rico',
 'NativeCountry_Scotland',
 'NativeCountry_South',
 'NativeCountry_Taiwan',
 'NativeCountry_Thailand',
 'NativeCountry_Trinadad&Tobago',
 'NativeCountry_United-States',
 'NativeCountry_Vietnam',
 'NativeCountry_Yugoslavia',
 'Income_<=50K',
 'Income_>50K'
 #'age_bins_(10, 19]',
 #'age_bins_(19, 29]',
 #'age_bins_(29, 39]',
 #'age_bins_(39, 49]',
 #'age_bins_(49, 59]',
 #'age_bins_(59, 69]',
 #'age_bins_(69, 79]',
 #'age_bins_(79, 89]',
 #'age_bins_(89, 99]'
                         ]]
In [83]:
x_train, x_test, y_train, y_test = train_test_split(preprocessing.scale(bern_xvars), 
                                                    target, 
                                                    test_size=0.2, 
                                                    random_state=1)
# Scaling the binomial variables did not change the model accuracy
In [84]:
x_train, x_test, y_train, y_test = train_test_split(bern_xvars, 
                                                    target, 
                                                    test_size=0.2, 
                                                    random_state=1)
In [85]:
mnbb_clf = BernoulliNB(
    class_prior = [0.0313, 0.0685, 0.0002, 0.7387, 0.0356, 0.0828, 0.0424, 0.0005]
                              )
In [86]:
mnbb_model = mnbb_clf.fit(x_train, y_train)
In [87]:
y_mnbb_train = y_train.copy()
y_mnbb_train['CLASS_PREDS'] = mnbb_model.predict(x_train)

Confusion Matrix (Train)

In [88]:
pd.crosstab(y_mnbb_train['WorkClass_c'], y_mnbb_train['CLASS_PREDS'],
            rownames=['True'], colnames=['Predicted'], margins=True)
Out[88]:
Predicted 1 2 4 5 6 7 All
True
1 32 28 523 125 32 6 746
2 5 383 925 189 108 15 1625
3 0 1 4 0 1 0 6
4 57 360 13912 2092 1391 50 17862
5 1 14 340 378 107 7 847
6 3 53 1035 332 556 15 1994
7 8 130 653 156 56 42 1045
8 0 0 9 0 1 0 10
All 106 969 17401 3272 2252 135 24135
In [89]:
# Predict probabilities with the multinomial model
bbayes_train = pd.DataFrame(mnbb_model.predict_proba(x_train))
bbayes_train.columns = ['prob' + str(col) for col in bbayes_train.columns]
bbayes_train.head()
Out[89]:
prob0 prob1 prob2 prob3 prob4 prob5 prob6 prob7
0 0.017176 0.016249 2.166075e-11 0.363323 0.197061 0.394509 0.011682 2.494796e-07
1 0.015505 0.007181 6.572325e-11 0.614404 0.197153 0.162052 0.003704 1.021810e-07
2 0.005336 0.001826 5.876449e-09 0.923738 0.012284 0.053312 0.003503 2.397255e-07
3 0.000015 0.000089 1.194468e-07 0.999177 0.000030 0.000610 0.000065 1.450223e-05
4 0.011154 0.037073 3.346436e-09 0.918373 0.000198 0.010838 0.022364 7.677552e-08
In [90]:
confusion_matrix(y_mnbb_train['WorkClass_c'], y_mnbb_train['CLASS_PREDS'])
Out[90]:
array([[   32,    28,     0,   523,   125,    32,     6,     0],
       [    5,   383,     0,   925,   189,   108,    15,     0],
       [    0,     1,     0,     4,     0,     1,     0,     0],
       [   57,   360,     0, 13912,  2092,  1391,    50,     0],
       [    1,    14,     0,   340,   378,   107,     7,     0],
       [    3,    53,     0,  1035,   332,   556,    15,     0],
       [    8,   130,     0,   653,   156,    56,    42,     0],
       [    0,     0,     0,     9,     0,     1,     0,     0]])
In [91]:
y_mnbb_test = y_test.copy()
y_mnbb_test['CLASS_PREDS'] = mnbb_model.predict(x_test)

Confusion Matrix (Test)

In [92]:
pd.crosstab(y_mnbb_test['WorkClass_c'], y_mnbb_test['CLASS_PREDS'],
            rownames=['True'], colnames=['Predicted'], margins=True)
Out[92]:
Predicted 1 2 4 5 6 7 All
True
1 4 19 131 32 9 2 197
2 2 109 238 56 35 2 442
3 0 0 1 0 0 0 1
4 12 72 3430 548 342 20 4424
5 1 2 79 115 28 2 227
6 0 8 276 92 129 0 505
7 1 33 142 40 14 4 234
8 0 0 4 0 0 0 4
All 20 243 4301 883 557 30 6034
In [93]:
# Predict probabilities with the Bernoulli model
bbayes_test = pd.DataFrame(mnbb_model.predict_proba(x_test))
bbayes_test.columns = ['prob' + str(col) for col in bbayes_test.columns]
bbayes_test.head()
Out[93]:
prob0 prob1 prob2 prob3 prob4 prob5 prob6 prob7
0 0.089593 0.135073 1.010547e-09 0.431739 0.139300 0.050128 0.154167 8.347114e-09
1 0.000792 0.000546 7.838645e-08 0.983166 0.002345 0.011528 0.001617 5.527486e-06
2 0.021081 0.046403 1.470271e-11 0.023699 0.627670 0.202226 0.078921 1.039024e-08
3 0.000080 0.000186 1.464137e-07 0.989257 0.000789 0.009229 0.000457 2.011263e-06
4 0.005166 0.010340 2.007746e-11 0.418806 0.035189 0.527572 0.002927 2.312439e-07
In [94]:
confusion_matrix(y_mnbb_test['WorkClass_c'], y_mnbb_test['CLASS_PREDS'])
Out[94]:
array([[   4,   19,    0,  131,   32,    9,    2,    0],
       [   2,  109,    0,  238,   56,   35,    2,    0],
       [   0,    0,    0,    1,    0,    0,    0,    0],
       [  12,   72,    0, 3430,  548,  342,   20,    0],
       [   1,    2,    0,   79,  115,   28,    2,    0],
       [   0,    8,    0,  276,   92,  129,    0,    0],
       [   1,   33,    0,  142,   40,   14,    4,    0],
       [   0,    0,    0,    4,    0,    0,    0,    0]])
In [95]:
accuracy_score(y_mnbb_train['WorkClass_c'],  y_mnbb_train['CLASS_PREDS'])
Out[95]:
0.634058421379739
In [96]:
accuracy_score(y_mnbb_test['WorkClass_c'],  y_mnbb_test['CLASS_PREDS'])
Out[96]:
0.6282731189923765
In [97]:
0.6341-0.6283
Out[97]:
0.005800000000000027
In [98]:
y_mnbb_test['WC1'] = np.where(y_mnbb_test['WorkClass_c'] == 1, 1, 0)
y_mnbb_test['WC2'] = np.where(y_mnbb_test['WorkClass_c'] == 2, 1, 0)
y_mnbb_test['WC3'] = np.where(y_mnbb_test['WorkClass_c'] == 3, 1, 0)
y_mnbb_test['WC4'] = np.where(y_mnbb_test['WorkClass_c'] == 4, 1, 0)
y_mnbb_test['WC5'] = np.where(y_mnbb_test['WorkClass_c'] == 5, 1, 0)
y_mnbb_test['WC6'] = np.where(y_mnbb_test['WorkClass_c'] == 6, 1, 0)
y_mnbb_test['WC7'] = np.where(y_mnbb_test['WorkClass_c'] == 7, 1, 0)
y_mnbb_test['WC8'] = np.where(y_mnbb_test['WorkClass_c'] == 8, 1, 0)
In [99]:
y_mnbb_test.head()
Out[99]:
WorkClass_c CLASS_PREDS WC1 WC2 WC3 WC4 WC5 WC6 WC7 WC8
25150 4 4 0 0 0 1 0 0 0 0
12079 4 4 0 0 0 1 0 0 0 0
96 6 5 0 0 0 0 0 1 0 0
572 4 4 0 0 0 1 0 0 0 0
26565 6 6 0 0 0 0 0 1 0 0
In [100]:
# Visualize these ROC curves with matplotlib

plt.plot(roc_curve(y_mnbb_test['WC1'], bbayes_test['prob0'])[0],roc_curve(y_mnbb_test['WC1'], bbayes_test['prob0'])[1], 
         color = 'blue', label='WorkClass: 1 ROC Curve (area = %0.5f)' % roc_auc_score(y_mnbb_test['WC1'], bbayes_test['prob0']))

plt.plot(roc_curve(y_mnbb_test['WC2'], bbayes_test['prob1'])[0],roc_curve(y_mnbb_test['WC2'], bbayes_test['prob1'])[1], 
         color = 'blue', label='WorkClass: 2 ROC Curve (area = %0.5f)' % roc_auc_score(y_mnbb_test['WC2'], bbayes_test['prob0']))

plt.plot(roc_curve(y_mnbb_test['WC3'], bbayes_test['prob2'])[0],roc_curve(y_mnbb_test['WC3'], bbayes_test['prob2'])[1], 
         color = 'blue', label='WorkClass: 3 ROC Curve (area = %0.5f)' % roc_auc_score(y_mnbb_test['WC3'], bbayes_test['prob0']))

plt.plot(roc_curve(y_mnbb_test['WC4'], bbayes_test['prob3'])[0],roc_curve(y_mnbb_test['WC4'], bbayes_test['prob3'])[1], 
         color = 'red', label='WorkClass: 4 ROC Curve (area = %0.5f)' % roc_auc_score(y_mnbb_test['WC4'], bbayes_test['prob1']))

plt.plot(roc_curve(y_mnbb_test['WC5'], bbayes_test['prob4'])[0],roc_curve(y_mnbb_test['WC5'], bbayes_test['prob4'])[1], 
         color = 'green', label='WorkClass: 5 ROC Curve (area = %0.5f)' % roc_auc_score(y_mnbb_test['WC5'], bbayes_test['prob2']))

plt.plot(roc_curve(y_mnbb_test['WC6'], bbayes_test['prob5'])[0],roc_curve(y_mnbb_test['WC6'], bbayes_test['prob5'])[1], 
         color = 'purple', label='WorkClass: 6 ROC Curve (area = %0.5f)' % roc_auc_score(y_mnbb_test['WC6'], bbayes_test['prob3']))

plt.plot(roc_curve(y_mnbb_test['WC7'], bbayes_test['prob6'])[0],roc_curve(y_mnbb_test['WC7'], bbayes_test['prob6'])[1], 
         color = 'orange', label='WorkClass: 7 ROC Curve (area = %0.5f)' % roc_auc_score(y_mnbb_test['WC7'], bbayes_test['prob4']))

plt.plot(roc_curve(y_mnbb_test['WC8'], bbayes_test['prob7'])[0],roc_curve(y_mnbb_test['WC8'], bbayes_test['prob7'])[1], 
         color = 'brown', label='WorkClass: 8 ROC Curve (area = %0.5f)' % roc_auc_score(y_mnbb_test['WC8'], bbayes_test['prob5']))



plt.plot([0, 2], [0, 2], color='black', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
plt.title('Multinomial Bayes AUC')
plt.legend()
plt.show()
sns.set(style='white', rc={'figure.figsize':(10,10)})

Question 2: Predict the Working Class with Multinomial Logistic Regression

The multinomial logistic regression model has a higher accuracy on both train and test, 0.7496 and 0.7454, respectively, than the multinomial naive Bayes model with train and test accuracy 0.6341 and 0.6283, respectively. The multinomial logistic regression model predicts classes 1 through 7 on train and 2 through 7 on test. This model did not include binned Age.

Note: When the binned Age variable was included as dummy variables, the accuracy increased slightly for train and test at 0.7506 and 0.7476, respectively. The predicted classes between train and test changed only slightly with test predicting classes 2 through 6.

In [101]:
mlr_target = adult_known[['WorkClass_c']]
mlr_xvars = adult_known[['Education_10th',
 'Education_11th',
 'Education_12th',
 'Education_1st-4th',
 'Education_5th-6th',
 'Education_7th-8th',
 'Education_9th',
 'Education_Assoc-acdm',
 'Education_Assoc-voc',
 'Education_Bachelors',
 'Education_Doctorate',
 'Education_HS-grad',
 'Education_Masters',
 'Education_Preschool',
 'Education_Prof-school',
 'Education_Some-college',
 'MaritalStatus_Divorced',
 'MaritalStatus_Married-AF-spouse',
 'MaritalStatus_Married-civ-spouse',
 'MaritalStatus_Married-spouse-absent',
 'MaritalStatus_Never-married',
 'MaritalStatus_Separated',
 'MaritalStatus_Widowed',
 'Occupation_Adm-clerical',
 'Occupation_Armed-Forces',
 'Occupation_Craft-repair',
 'Occupation_Exec-managerial',
 'Occupation_Farming-fishing',
 'Occupation_Handlers-cleaners',
 'Occupation_Machine-op-inspct',
 'Occupation_Never-worked',
 'Occupation_Other-service',
 'Occupation_Priv-house-serv',
 'Occupation_Prof-specialty',
 'Occupation_Protective-serv',
 'Occupation_Sales',
 'Occupation_Tech-support',
 'Occupation_Transport-moving',
 'Relationship_Husband',
 'Relationship_Not-in-family',
 'Relationship_Other-relative',
 'Relationship_Own-child',
 'Relationship_Unmarried',
 'Relationship_Wife',
 'Race_Amer-Indian-Eskimo',
 'Race_Asian-Pac-Islander',
 'Race_Black',
 'Race_Other',
 'Race_White',
 'Gender_Female',
 'Gender_Male',
 'NativeCountry_Cambodia',
 'NativeCountry_Canada',
 'NativeCountry_China',
 'NativeCountry_Columbia',
 'NativeCountry_Cuba',
 'NativeCountry_Dominican-Republic',
 'NativeCountry_Ecuador',
 'NativeCountry_El-Salvador',
 'NativeCountry_England',
 'NativeCountry_France',
 'NativeCountry_Germany',
 'NativeCountry_Greece',
 'NativeCountry_Guatemala',
 'NativeCountry_Haiti',
 'NativeCountry_Holand-Netherlands',
 'NativeCountry_Honduras',
 'NativeCountry_Hong',
 'NativeCountry_Hungary',
 'NativeCountry_India',
 'NativeCountry_Iran',
 'NativeCountry_Ireland',
 'NativeCountry_Italy',
 'NativeCountry_Jamaica',
 'NativeCountry_Japan',
 'NativeCountry_Laos',
 'NativeCountry_Mexico',
 'NativeCountry_Nicaragua',
 'NativeCountry_Outlying-US(Guam-USVI-etc)',
 'NativeCountry_Peru',
 'NativeCountry_Philippines',
 'NativeCountry_Poland',
 'NativeCountry_Portugal',
 'NativeCountry_Puerto-Rico',
 'NativeCountry_Scotland',
 'NativeCountry_South',
 'NativeCountry_Taiwan',
 'NativeCountry_Thailand',
 'NativeCountry_Trinadad&Tobago',
 'NativeCountry_United-States',
 'NativeCountry_Vietnam',
 'NativeCountry_Yugoslavia',
 'Income_<=50K',
 'Income_>50K'
 #'age_bins_(10, 19]',
 #'age_bins_(19, 29]',
 #'age_bins_(29, 39]',
 #'age_bins_(39, 49]',
 #'age_bins_(49, 59]',
 #'age_bins_(59, 69]',
 #'age_bins_(69, 79]',
 #'age_bins_(79, 89]',
 #'age_bins_(89, 99]'
                         ]]
In [102]:
x_train, x_test, y_train, y_test = train_test_split(mlr_xvars, 
                                                    mlr_target, 
                                                    test_size=0.2, 
                                                    random_state=1)
In [103]:
#mnlr = LogisticRegression(random_state=0, multi_class='multinomial', solver='newton-cg')
# Multinomial logistic regression with ridge
mnlr = LogisticRegression(random_state=0, multi_class='multinomial', solver='newton-cg', penalty = 'l2',
                         C = 2, max_iter = 100)
In [104]:
mnlr_model = mnlr.fit(x_train, y_train)
In [105]:
y_mnlr_train = y_train.copy()
y_mnlr_train['CLASS_PREDS'] = mnlr_model.predict(x_train)

Confusion Matrix (Train)

In [106]:
pd.crosstab(y_mnlr_train['WorkClass_c'], y_mnlr_train['CLASS_PREDS'],
            rownames=['True'], colnames=['Predicted'], margins=True)
Out[106]:
Predicted 1 2 3 4 5 6 7 All
True
1 9 17 0 716 1 3 0 746
2 0 226 0 1389 0 10 0 1625
3 0 0 6 0 0 0 0 6
4 0 133 0 17579 3 143 4 17862
5 0 4 0 811 4 28 0 847
6 0 5 0 1723 1 265 0 1994
7 1 85 0 947 0 10 2 1045
8 0 0 0 8 0 2 0 10
All 10 470 6 23173 9 461 6 24135
In [107]:
# Predict probabilities with the multinomial logistic model
mnlr_train = pd.DataFrame(mnlr_model.predict_proba(x_train))
mnlr_train.columns = ['prob' + str(col) for col in mnlr_train.columns]
mnlr_train.head()
Out[107]:
prob0 prob1 prob2 prob3 prob4 prob5 prob6 prob7
0 0.033591 0.035926 0.000052 0.642632 0.094867 0.152238 0.040451 0.000242
1 0.008241 0.004468 0.000008 0.956147 0.008932 0.018682 0.003503 0.000018
2 0.006988 0.002046 0.000081 0.751021 0.095285 0.140994 0.003557 0.000028
3 0.000183 0.000893 0.000026 0.978719 0.010589 0.009170 0.000372 0.000049
4 0.012714 0.048602 0.000101 0.822402 0.009825 0.084686 0.021649 0.000022
In [108]:
y_mnlr_test = y_test.copy()
y_mnlr_test['CLASS_PREDS'] = mnlr_model.predict(x_test)

Confusion Matrix (Test)

In [109]:
pd.crosstab(y_mnlr_test['WorkClass_c'], y_mnlr_test['CLASS_PREDS'],
            rownames=['True'], colnames=['Predicted'], margins=True)
Out[109]:
Predicted 2 3 4 5 6 All
True
1 11 0 184 0 2 197
2 65 0 375 0 2 442
3 0 1 0 0 0 1
4 24 0 4366 0 34 4424
5 1 0 218 0 8 227
6 1 0 437 1 66 505
7 24 0 210 0 0 234
8 0 0 4 0 0 4
All 126 1 5794 1 112 6034
In [110]:
# Predict probabilities with the multinomial logistic model
mnlr_test = pd.DataFrame(mnlr_model.predict_proba(x_test))
mnlr_test.columns = ['prob' + str(col) for col in mnlr_test.columns]
mnlr_test.head()
Out[110]:
prob0 prob1 prob2 prob3 prob4 prob5 prob6 prob7
0 0.054292 0.067534 0.000060 0.672798 0.070697 0.063730 0.070885 0.000004
1 0.002560 0.001430 0.000220 0.890881 0.046118 0.055444 0.002965 0.000383
2 0.052399 0.094085 0.000034 0.382974 0.083537 0.159736 0.227208 0.000027
3 0.000640 0.000809 0.000260 0.907213 0.035553 0.053917 0.001549 0.000059
4 0.013594 0.036024 0.000035 0.736210 0.025635 0.173247 0.015078 0.000177
In [111]:
accuracy_score(y_mnlr_train['WorkClass_c'],  y_mnlr_train['CLASS_PREDS'])
Out[111]:
0.7495753055728196
In [112]:
accuracy_score(y_mnlr_test['WorkClass_c'],  y_mnlr_test['CLASS_PREDS'])
Out[112]:
0.7454424925422605
In [113]:
y_mnlr_test['WC1'] = np.where(y_mnlr_test['WorkClass_c'] == 1, 1, 0)
y_mnlr_test['WC2'] = np.where(y_mnlr_test['WorkClass_c'] == 2, 1, 0)
y_mnlr_test['WC3'] = np.where(y_mnlr_test['WorkClass_c'] == 3, 1, 0)
y_mnlr_test['WC4'] = np.where(y_mnlr_test['WorkClass_c'] == 4, 1, 0)
y_mnlr_test['WC5'] = np.where(y_mnlr_test['WorkClass_c'] == 5, 1, 0)
y_mnlr_test['WC6'] = np.where(y_mnlr_test['WorkClass_c'] == 6, 1, 0)
y_mnlr_test['WC7'] = np.where(y_mnlr_test['WorkClass_c'] == 7, 1, 0)
y_mnlr_test['WC8'] = np.where(y_mnlr_test['WorkClass_c'] == 8, 1, 0)
In [114]:
# Visualize these ROC curves with matplotlib

plt.plot(roc_curve(y_mnlr_test['WC1'], mnlr_test['prob0'])[0],roc_curve(y_mnlr_test['WC1'], mnlr_test['prob0'])[1], 
         color = 'blue', label='WorkClass: 1 ROC Curve (area = %0.5f)' % roc_auc_score(y_mnlr_test['WC1'], mnlr_test['prob0']))

plt.plot(roc_curve(y_mnlr_test['WC2'], mnlr_test['prob1'])[0],roc_curve(y_mnlr_test['WC2'], mnlr_test['prob1'])[1], 
         color = 'blue', label='WorkClass: 2 ROC Curve (area = %0.5f)' % roc_auc_score(y_mnlr_test['WC2'], mnlr_test['prob0']))

plt.plot(roc_curve(y_mnlr_test['WC3'], mnlr_test['prob2'])[0],roc_curve(y_mnlr_test['WC3'], mnlr_test['prob2'])[1], 
         color = 'blue', label='WorkClass: 3 ROC Curve (area = %0.5f)' % roc_auc_score(y_mnlr_test['WC3'], mnlr_test['prob0']))

plt.plot(roc_curve(y_mnlr_test['WC4'], mnlr_test['prob3'])[0],roc_curve(y_mnlr_test['WC4'], mnlr_test['prob3'])[1], 
         color = 'red', label='WorkClass: 4 ROC Curve (area = %0.5f)' % roc_auc_score(y_mnlr_test['WC4'], mnlr_test['prob1']))

plt.plot(roc_curve(y_mnlr_test['WC5'], mnlr_test['prob4'])[0],roc_curve(y_mnlr_test['WC5'], mnlr_test['prob4'])[1], 
         color = 'green', label='WorkClass: 5 ROC Curve (area = %0.5f)' % roc_auc_score(y_mnlr_test['WC5'], mnlr_test['prob2']))

plt.plot(roc_curve(y_mnlr_test['WC6'], mnlr_test['prob5'])[0],roc_curve(y_mnlr_test['WC6'], mnlr_test['prob5'])[1], 
         color = 'purple', label='WorkClass: 6 ROC Curve (area = %0.5f)' % roc_auc_score(y_mnlr_test['WC6'], mnlr_test['prob3']))

plt.plot(roc_curve(y_mnlr_test['WC7'], mnlr_test['prob6'])[0],roc_curve(y_mnlr_test['WC7'], mnlr_test['prob6'])[1], 
         color = 'orange', label='WorkClass: 7 ROC Curve (area = %0.5f)' % roc_auc_score(y_mnlr_test['WC7'], mnlr_test['prob4']))

plt.plot(roc_curve(y_mnlr_test['WC8'], mnlr_test['prob7'])[0],roc_curve(y_mnlr_test['WC8'], mnlr_test['prob7'])[1], 
         color = 'brown', label='WorkClass: 8 ROC Curve (area = %0.5f)' % roc_auc_score(y_mnlr_test['WC8'], mnlr_test['prob5']))



plt.plot([0, 2], [0, 2], color='black', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
plt.title('Multinomial Logistic AUC')
plt.legend()
plt.show()
sns.set(style='white', rc={'figure.figsize':(10,10)})

ROC Curves

The ROC curves show how well the model does in terms of the sensitivity and specificity. Sensitivity is represented on the y-axis, and the complement of specificity (1 - specificity) is represented on the x-axis. As sensitivity increases, specificity decreases so a model that is highly accurate will have an ROC curve that closely follows the left and top margins of the graph. The area under the curve (AUC) is a measure used to evaluate the model: a high AUC indicates high sensitivity and low specificity, which in turn indicates an accurate model.

In this multinomial model, each curve represents the trade-off between sensitivity and specificity for each WorkClass_c outcome. According to the graph, the green line (WorkClass_c = 5) appears to have the largest area because it follows the left and top margins better than the other curves. However, the AUC for the green line, 0.3035, is the lowest of all of the AUC measures.

Question 3: Predict the Income with Random Forests

The Random Forest Classifier method, without parameter tuning, overfit the data with a train and test accuracy of 0.9248 and 0.8560, respectively. After using RandomizedSearchCV to choose the hyperparamter values, the accuracy for train dropped slightly to 0.8991, but the accuracy for test jumped up to 0.8810. When the binned Age dummy variables were included, the accuracy on train and test, with hyperparameter tuning, was 0.9139 and 0.8903, respectively.

The hyperparamter 'min_samples_split' effected the accuracy when changed from 10, the value selected by RandomizedSearchCV, to 20, 50, and 100. The difference between the train and test accuracy values stayed between 0.01 and 0.02, however, there was a drop in accuracy.

min_samples_split=20 train accuracy: 0.8811 test accuracy: 0.8791

min_samples_split=50 train accuracy: 0.9027 test accuracy: 0.8907

min_samples_split=100 train accuracy: 0.8969 test accuracy: 0.8898

One interesting thing about the above values is that, of the three examples, the hyperparameter value of 50 has a train and test accuracy that is higher than the respective train and test accuracy values produced by the hyperparameter values 20 and 100.

In [115]:
#list(adult_known.columns.values.tolist())
In [116]:
rfor_target = adult_known[['Income_>50K']]
rfor_xvars = adult_known[['WorkClass_Federal-gov',
 'WorkClass_Local-gov',
 'WorkClass_Never-worked',
 'WorkClass_Private',
 'WorkClass_Self-emp-inc',
 'WorkClass_Self-emp-not-inc',
 'WorkClass_State-gov',
 'WorkClass_Without-pay', 
 'Education_10th',
 'Education_11th',
 'Education_12th',
 'Education_1st-4th',
 'Education_5th-6th',
 'Education_7th-8th',
 'Education_9th',
 'Education_Assoc-acdm',
 'Education_Assoc-voc',
 'Education_Bachelors',
 'Education_Doctorate',
 'Education_HS-grad',
 'Education_Masters',
 'Education_Preschool',
 'Education_Prof-school',
 'Education_Some-college',
 'MaritalStatus_Divorced',
 'MaritalStatus_Married-AF-spouse',
 'MaritalStatus_Married-civ-spouse',
 'MaritalStatus_Married-spouse-absent',
 'MaritalStatus_Never-married',
 'MaritalStatus_Separated',
 'MaritalStatus_Widowed',
 'Occupation_Adm-clerical',
 'Occupation_Armed-Forces',
 'Occupation_Craft-repair',
 'Occupation_Exec-managerial',
 'Occupation_Farming-fishing',
 'Occupation_Handlers-cleaners',
 'Occupation_Machine-op-inspct',
 'Occupation_Never-worked',
 'Occupation_Other-service',
 'Occupation_Priv-house-serv',
 'Occupation_Prof-specialty',
 'Occupation_Protective-serv',
 'Occupation_Sales',
 'Occupation_Tech-support',
 'Occupation_Transport-moving',
 'Relationship_Husband',
 'Relationship_Not-in-family',
 'Relationship_Other-relative',
 'Relationship_Own-child',
 'Relationship_Unmarried',
 'Relationship_Wife',
 'Race_Amer-Indian-Eskimo',
 'Race_Asian-Pac-Islander',
 'Race_Black',
 'Race_Other',
 'Race_White',
 'Gender_Female',
 'Gender_Male',
 'NativeCountry_Cambodia',
 'NativeCountry_Canada',
 'NativeCountry_China',
 'NativeCountry_Columbia',
 'NativeCountry_Cuba',
 'NativeCountry_Dominican-Republic',
 'NativeCountry_Ecuador',
 'NativeCountry_El-Salvador',
 'NativeCountry_England',
 'NativeCountry_France',
 'NativeCountry_Germany',
 'NativeCountry_Greece',
 'NativeCountry_Guatemala',
 'NativeCountry_Haiti',
 'NativeCountry_Holand-Netherlands',
 'NativeCountry_Honduras',
 'NativeCountry_Hong',
 'NativeCountry_Hungary',
 'NativeCountry_India',
 'NativeCountry_Iran',
 'NativeCountry_Ireland',
 'NativeCountry_Italy',
 'NativeCountry_Jamaica',
 'NativeCountry_Japan',
 'NativeCountry_Laos',
 'NativeCountry_Mexico',
 'NativeCountry_Nicaragua',
 'NativeCountry_Outlying-US(Guam-USVI-etc)',
 'NativeCountry_Peru',
 'NativeCountry_Philippines',
 'NativeCountry_Poland',
 'NativeCountry_Portugal',
 'NativeCountry_Puerto-Rico',
 'NativeCountry_Scotland',
 'NativeCountry_South',
 'NativeCountry_Taiwan',
 'NativeCountry_Thailand',
 'NativeCountry_Trinadad&Tobago',
 'NativeCountry_United-States',
 'NativeCountry_Vietnam',
 'NativeCountry_Yugoslavia',
 'age_bins_(10, 19]',
 'age_bins_(19, 29]',
 'age_bins_(29, 39]',
 'age_bins_(39, 49]',
 'age_bins_(49, 59]',
 'age_bins_(59, 69]',
 'age_bins_(69, 79]',
 'age_bins_(79, 89]',
 'age_bins_(89, 99]'
                         ]]
In [117]:
x_train, x_test, y_train, y_test = train_test_split(rfor_xvars, 
                                                    rfor_target, 
                                                    test_size=0.2, 
                                                    random_state=1)
In [118]:
freq = pd.crosstab(adult_known['Income_>50K'], columns='count')
freq
Out[118]:
col_0 count
Income_>50K
0 22661
1 7508
In [119]:
# Obtain distribution of Income
labels = adult_known['Income_>50K'].astype('category').cat.categories.tolist()
freq = pd.crosstab(adult_known['Income_>50K'], columns='proportion').apply(lambda r: r/len(adult_known), axis=1)
freq.head(len(labels))
Out[119]:
col_0 proportion
Income_>50K
0 0.751135
1 0.248865
In [120]:
# Random forest classifier
rf_clf = RandomForestClassifier(random_state = 0, n_jobs = -1)
In [121]:
rf_clf_model = rf_clf.fit(x_train, y_train)
In [122]:
rf_train_probs = rf_clf_model.predict_proba(x_train)
rf_test_probs = rf_clf_model.predict_proba(x_test)
In [123]:
rf_train_prob_cols = ['prob0', 'prob1']
In [124]:
rf_preds_train = pd.DataFrame(rf_train_probs, columns=rf_train_prob_cols)
rf_preds_test = pd.DataFrame(rf_test_probs, columns=rf_train_prob_cols)
In [127]:
fpr, tpr, thresholds = metrics.roc_curve(y_train, rf_preds_train['prob1'])
metrics.auc(fpr, tpr)
Out[127]:
0.9485018797309471
In [128]:
fpr, tpr, thresholds = metrics.roc_curve(y_test, rf_preds_test['prob1'])
metrics.auc(fpr, tpr)
Out[128]:
0.8536613299563438
In [483]:
# Look at current hyperparameters
print('Parameters currently in use:\n')
pprint(rf_clf.get_params())
Parameters currently in use:

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': 0,
 'verbose': 0,
 'warm_start': False}

n_estimators = number of trees in the forest

max_features = max number of features considered for splitting a node

max_depth = max number of levels in each decision tree, None means expands until leaf purity or all leaves contain < min_samples_split samples

min_samples_split = minimum number of data points placed in a node before the node is split

min_samples_leaf = minimum number of samples required to be at a leaf node

bootstrap = method for sampling data points (with or without replacement)

class_weight = weights associated with classes in the form {class_label: weight}. If not given, all classes are supposed to have weight one

crterion = quality measure - gini or entropy

min_impurity_decrease = node will be split if this split induces a decrease of the impurity greater than or equal to this value

min_weight_fraction_leaf = minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node

In [484]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt', 'log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Method of measuring quality
criterion = ['gini', 'entropy']
# Set minimum impurity threshold
min_impurity_decrease = [0.0, 0.5, 1.0, 2.0, 5.0]
# 
min_weight_fraction_leaf = [0, 0.25, 0.5]
# 
class_weight = [{0:0.5, 1:0.5}, {0:0.75, 1:0.25}]


# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap,
               'criterion': criterion,
               'min_impurity_decrease': min_impurity_decrease,
               'min_weight_fraction_leaf': min_weight_fraction_leaf,
               'class_weight': class_weight}

pprint(random_grid)
{'bootstrap': [True, False],
 'class_weight': [{0: 0.5, 1: 0.5}, {0: 0.75, 1: 0.25}],
 'criterion': ['gini', 'entropy'],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt', 'log2'],
 'min_impurity_decrease': [0.0, 0.5, 1.0, 2.0, 5.0],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'min_weight_fraction_leaf': [0, 0.25, 0.5],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}
In [485]:
rf_clf = RandomForestClassifier()
In [211]:
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf_clf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
In [212]:
# Fit the random search model
rf_random.fit(x_train, y_train)
Fitting 3 folds for each of 100 candidates, totalling 300 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  7.7min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 14.9min finished
Out[212]:
RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators='warn',
                                                    n_jobs=None,
                                                    oob_sc...
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_impurity_decrease': [0.0, 0.5, 1.0,
                                                                  2.0, 5.0],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'min_weight_fraction_leaf': [0, 0.25,
                                                                     0.5],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   pre_dispatch='2*n_jobs', random_state=42, refit=True,
                   return_train_score=False, scoring=None, verbose=2)
In [213]:
# View the best parameters
rf_random.best_params_
Out[213]:
{'n_estimators': 1200,
 'min_weight_fraction_leaf': 0,
 'min_samples_split': 10,
 'min_samples_leaf': 2,
 'min_impurity_decrease': 0.0,
 'max_features': 'auto',
 'max_depth': 70,
 'criterion': 'entropy',
 'class_weight': {0: 0.5, 1: 0.5},
 'bootstrap': False}
In [129]:
rf_clf = RandomForestClassifier(random_state = 0
                                , n_jobs = -1
                                ,n_estimators = 1200
                                ,criterion = 'entropy'
                                ,max_features = 'auto'
                                , max_depth = 70
                                ,min_samples_split = 100
                                ,min_samples_leaf = 2
                                ,min_weight_fraction_leaf = 0
                                ,min_impurity_decrease = 0.0
                                ,class_weight = {0: 0.5, 1: 0.5}
                               )
In [130]:
rf_clf = RandomForestClassifier(random_state = 0
                                , n_jobs = -1
                                ,n_estimators = 1200
                                ,criterion = 'entropy'
                                ,max_features = 'auto'
                                , max_depth = 70
                                ,min_samples_split = 10
                                ,min_samples_leaf = 2
                                ,min_weight_fraction_leaf = 0
                                ,min_impurity_decrease = 0.0
                                ,class_weight = {0: 0.5, 1: 0.5}
                               )
In [131]:
rf_clf_model = rf_clf.fit(x_train, y_train)
In [132]:
rf_train_probs = rf_clf_model.predict_proba(x_train)
rf_test_probs = rf_clf_model.predict_proba(x_test)
In [133]:
rf_train_prob_cols = ['prob0', 'prob1']
In [134]:
rf_preds_train = pd.DataFrame(rf_train_probs, columns=rf_train_prob_cols)
rf_preds_test = pd.DataFrame(rf_test_probs, columns=rf_train_prob_cols)
In [135]:
fpr, tpr, thresholds = metrics.roc_curve(y_train, rf_preds_train['prob1'])
metrics.auc(fpr, tpr)
Out[135]:
0.9139011374200694
In [136]:
fpr, tpr, thresholds = metrics.roc_curve(y_test, rf_preds_test['prob1'])
metrics.auc(fpr, tpr)
Out[136]:
0.8902690671285068
In [137]:
0.9139-0.8903
Out[137]:
0.023600000000000065

Parts IV - V

In [138]:
import warnings;
warnings.filterwarnings('ignore');
In [139]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LassoCV
from sklearn.linear_model import Lasso
import seaborn as sns
from matplotlib import pyplot
import matplotlib.pyplot as plt
import matplotlib
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
import plotly.express as px
from sklearn import metrics
from sklearn.manifold import TSNE
import umap

Pitcher_war.csv

In [140]:
# Read in csv file to dataframe
pd.set_option('display.max_columns', None)
pw = pd.read_csv('pitcher_war.csv')

# Display the head
pw.head()
Out[140]:
Season Name Team W L SV G GS IP K/9 BB/9 HR/9 BABIP LOB% GB% HR/FB ERA FIP xFIP WAR Age ERA.1 CG ShO SV.1 BS IP.1 TBF H R ER HR BB IBB HBP WP BK SO GB FB LD IFFB Balls Strikes Pitches RS IFH BU BUH K/9.1 BB/9.1 K/BB H/9 HR/9.1 AVG WHIP BABIP.1 LOB%.1 FIP.1 GB/FB LD% GB%.1 FB% IFFB% HR/FB.1 IFH% BUH% Starting Start-IP tERA xFIP.1 WPA RE24 REW pLI inLI gmLI exLI Pulls WPA/LI Clutch FB%.1 FBv SL% SLv CT% CTv CB% CBv CH% CHv SF% SFv KN% KNv XX% PO% wFB wSL wCT wCB wCH wSF wKN wFB/C wSL/C wCT/C wCB/C wCH/C wSF/C wKN/C O-Swing% Z-Swing% Swing% O-Contact% Z-Contact% Contact% Zone% F-Strike% SwStr% HLD SD MD ERA- FIP- xFIP- K% BB% SIERA RS/9 E-F FA% (pfx) FT% (pfx) O-Swing% (pfx) Z-Swing% (pfx) Swing% (pfx) O-Contact% (pfx) Z-Contact% (pfx) Contact% (pfx) Zone% (pfx) Pace RA9-WAR BIP-Wins LOB-Wins FDP-Wins Age Rng K-BB% Pull% Cent% Oppo% Soft% Med% Hard% kwERA TTO% CH% (pi) O-Swing% (pi) Z-Swing% (pi) Swing% (pi) O-Contact% (pi) Z-Contact% (pi) Contact% (pi) Zone% (pi) Pace (pi) FRM K/9+ BB/9+ K/BB+ H/9+ HR/9+ AVG+ WHIP+ BABIP+ LOB%+ K%+ BB%+ LD%+ GB%+ FB%+ HR/FB%+ Pull%+ Cent%+ Oppo%+ Soft%+ Med%+ Hard%+ playerid
0 2015 Zack Greinke Dodgers 19 3 0 32 32 222.2 8.08 1.62 0.57 0.229 86.50% 48.00% 7.30% 1.66 2.76 3.22 5.3 31 1.66 1 0 0 0 222.2 843 148 43 41 14 40 1 5 7 0 200 281 193 112 18 1161 2078 3239 101 12 12 0 8.08 1.62 5.00 5.98 0.57 0.185 0.84 0.229 86.50% 2.76 1.46 19.10% 48.00% 32.90% 9.30% 7.30% 4.30% 0.00% 46.9 222.2 2.81 3.22 6.76 56.24 6.24 1.06 1.03 0.87 1.22 31 5.47 0.90 50.70% 91.8 19.30% 86.9 NaN NaN 9.10% 73.6 20.90% 88.5 NaN NaN NaN NaN 0.10% NaN 20.0 18.6 NaN -0.5 19.8 NaN NaN 1.22 2.98 NaN -0.18 2.92 NaN NaN 33.30% 69.10% 47.50% 60.60% 85.10% 74.80% 39.90% 64.10% 12.00% 0 0 0 44 73 83 23.70% 4.70% 3.27 4.08 -1.10 43.20% 10.00% 33.30% 64.90% 47.50% 57.80% 85.50% 74.80% 45.00% 22.9 9.4 4.0 0.1 4.1 31 - 31 19.00% 38.60% 36.10% 25.30% 21.70% 51.50% 26.80% 3.21 30.10% 21.10% 33.60% 65.20% 47.50% 58.00% 85.90% 74.90% 44.10% 23.4 3.9 103 55 187 68 58 74 65 77 118 115 61 91 104 101 64 102 101 96 117 99 91 1943
1 2018 Jacob deGrom Mets 10 9 0 32 32 217.0 11.16 1.91 0.41 0.281 82.00% 46.40% 6.30% 1.70 1.99 2.60 9.0 30 1.70 1 0 0 0 217.0 835 152 48 41 10 46 3 5 2 0 269 235 160 111 26 999 2213 3212 86 16 9 2 11.16 1.91 5.85 6.30 0.41 0.194 0.91 0.281 82.00% 1.99 1.47 21.90% 46.40% 31.60% 16.30% 6.30% 6.80% 22.20% 75.7 217.0 2.16 2.60 5.85 52.57 5.70 1.14 1.06 0.88 1.01 31 4.62 0.52 52.10% 96.0 23.90% 91.1 NaN NaN 7.90% 82.7 16.10% 89.0 NaN NaN NaN NaN 0.30% NaN 23.9 15.9 NaN 4.6 12.8 NaN NaN 1.43 2.08 NaN 1.81 2.48 NaN NaN 37.90% 68.80% 51.70% 57.70% 79.70% 70.80% 44.60% 66.50% 15.10% 0 0 0 45 49 64 32.20% 5.50% 2.78 3.57 -0.29 43.50% 8.90% 36.80% 66.30% 51.70% 53.90% 79.90% 70.70% 50.70% 21.3 9.6 0.4 0.1 0.5 30 - 30 26.70% 42.30% 32.00% 25.60% 25.20% 48.20% 26.60% 2.60 38.90% 16.10% 36.70% 66.10% 51.80% 52.40% 80.40% 70.80% 51.30% 21.9 -2.1 130 58 226 75 38 80 70 96 112 144 64 101 106 92 50 106 92 101 138 104 76 10954
2 2015 Jake Arrieta Cubs 22 6 0 33 33 229.0 9.28 1.89 0.39 0.246 80.00% 56.20% 7.80% 1.77 2.35 2.61 7.0 29 1.77 4 3 0 0 229.0 870 150 52 45 10 48 2 6 6 0 236 318 129 119 12 1202 2236 3438 118 22 14 3 9.28 1.89 4.92 5.90 0.39 0.184 0.86 0.246 80.00% 2.35 2.47 21.00% 56.20% 22.80% 9.30% 7.80% 6.90% 21.40% 60.0 229.0 2.77 2.61 5.93 56.13 6.04 0.92 0.92 0.87 0.83 29 5.78 0.67 50.70% 94.6 NaN NaN 29.10% 90.3 15.40% 80.7 4.80% 89.0 NaN NaN NaN NaN 0.20% NaN 23.7 NaN 23.5 7.7 2.8 NaN NaN 1.36 NaN 2.35 1.45 1.74 NaN NaN 34.20% 63.00% 46.90% 61.30% 86.70% 76.30% 44.10% 60.20% 11.10% 0 0 0 45 61 68 27.10% 5.50% 2.75 4.64 -0.58 15.70% NaN 32.80% 62.30% 47.00% 59.00% 85.90% 76.10% 47.90% 22.7 9.1 2.9 -0.8 2.1 29 - 29 21.60% 34.30% 37.80% 27.90% 22.80% 55.20% 22.10% 2.90 33.80% 4.70% 33.00% 61.80% 46.90% 59.50% 85.60% 76.20% 48.40% 23.3 2.2 118 64 184 67 40 73 67 82 110 131 71 100 121 70 68 90 106 106 123 106 75 4153
3 2014 Clayton Kershaw Dodgers 21 3 0 27 27 198.1 10.85 1.41 0.41 0.278 81.60% 51.80% 6.60% 1.77 1.81 2.08 7.9 26 1.77 6 2 0 0 198.1 749 139 42 39 9 31 0 2 7 2 239 243 137 89 19 835 1887 2722 124 12 8 1 10.85 1.41 7.71 6.31 0.41 0.194 0.86 0.278 81.60% 1.81 1.77 19.00% 51.80% 29.20% 13.90% 6.60% 4.90% 12.50% 62.5 198.1 1.84 2.08 5.04 43.95 4.92 0.99 0.95 0.86 0.92 21 4.97 0.14 55.40% 93.0 29.40% 87.6 NaN NaN 14.30% 74.1 0.90% 88.0 NaN NaN NaN NaN NaN NaN 19.2 21.5 NaN 7.4 -1.9 NaN NaN 1.27 2.69 NaN 1.91 -7.46 NaN NaN 36.50% 68.60% 52.50% 55.60% 82.20% 72.90% 49.70% 68.80% 14.20% 0 0 0 51 49 56 31.90% 4.10% 2.09 5.63 -0.04 57.50% NaN 35.40% 67.90% 52.50% 50.30% 83.70% 73.00% 52.60% 23.0 8.2 0.8 -0.5 0.2 26 - 26 27.80% 43.80% 34.80% 21.40% 24.50% 51.20% 24.30% 1.94 37.30% 1.10% 37.10% 67.00% 52.50% 53.60% 83.10% 73.00% 51.50% 23.7 -2.2 139 49 286 74 49 79 68 94 111 155 54 91 113 88 68 111 100 82 133 98 82 2036
4 2013 Clayton Kershaw Dodgers 16 9 0 33 33 236.0 8.85 1.98 0.42 0.251 80.60% 46.00% 5.80% 1.83 2.39 2.88 7.2 25 1.83 3 2 0 0 236.0 908 164 55 48 11 52 2 3 12 2 232 278 189 137 18 1142 2286 3428 108 17 15 3 8.85 1.98 4.46 6.25 0.42 0.192 0.92 0.251 80.60% 2.39 1.47 22.70% 46.00% 31.30% 9.50% 5.80% 6.10% 20.00% 59.8 236.0 2.87 2.88 4.90 47.70 5.36 0.99 0.95 0.88 1.11 30 5.45 -0.51 60.70% 92.6 24.50% 85.1 NaN NaN 12.50% 73.7 2.40% 85.2 NaN NaN NaN NaN 0.10% NaN 37.8 5.3 NaN 11.4 -0.6 NaN NaN 1.82 0.64 NaN 2.67 -0.70 NaN NaN 32.50% 65.80% 48.40% 60.90% 84.70% 76.30% 47.60% 65.10% 11.40% 0 0 0 51 64 76 25.60% 5.70% 2.99 4.12 -0.56 60.70% NaN 33.10% 63.70% 48.50% 55.00% 87.00% 76.10% 50.20% 22.4 8.9 2.5 -0.8 1.7 25 - 25 19.80% 43.10% 35.40% 21.50% 14.40% 56.70% 28.90% 2.92 32.50% 2.30% 33.90% 63.50% 48.50% 56.30% 87.30% 76.30% 49.20% 23.4 -0.6 118 67 177 73 47 78 71 86 109 130 73 106 101 95 57 114 97 84 86 106 97 2036
In [141]:
pw['Team'].unique()
Out[141]:
array(['Dodgers', 'Mets', 'Cubs', 'Rays', 'Mariners', 'White Sox',
       'Marlins', 'Indians', 'Reds', 'Red Sox', 'Phillies', 'Cardinals',
       'Tigers', 'Angels', 'Nationals', '- - -', 'Astros', 'Braves',
       'Pirates', 'Athletics', 'Giants', 'Padres', 'Rangers', 'Rockies',
       'Diamondbacks', 'Blue Jays', 'Yankees', 'Royals', 'Twins',
       'Orioles', 'Brewers'], dtype=object)
In [142]:
pw_ = pw[pw['Team'] == '- - -']
pw_
Out[142]:
Season Name Team W L SV G GS IP K/9 BB/9 HR/9 BABIP LOB% GB% HR/FB ERA FIP xFIP WAR Age ERA.1 CG ShO SV.1 BS IP.1 TBF H R ER HR BB IBB HBP WP BK SO GB FB LD IFFB Balls Strikes Pitches RS IFH BU BUH K/9.1 BB/9.1 K/BB H/9 HR/9.1 AVG WHIP BABIP.1 LOB%.1 FIP.1 GB/FB LD% GB%.1 FB% IFFB% HR/FB.1 IFH% BUH% Starting Start-IP tERA xFIP.1 WPA RE24 REW pLI inLI gmLI exLI Pulls WPA/LI Clutch FB%.1 FBv SL% SLv CT% CTv CB% CBv CH% CHv SF% SFv KN% KNv XX% PO% wFB wSL wCT wCB wCH wSF wKN wFB/C wSL/C wCT/C wCB/C wCH/C wSF/C wKN/C O-Swing% Z-Swing% Swing% O-Contact% Z-Contact% Contact% Zone% F-Strike% SwStr% HLD SD MD ERA- FIP- xFIP- K% BB% SIERA RS/9 E-F FA% (pfx) FT% (pfx) O-Swing% (pfx) Z-Swing% (pfx) Swing% (pfx) O-Contact% (pfx) Z-Contact% (pfx) Contact% (pfx) Zone% (pfx) Pace RA9-WAR BIP-Wins LOB-Wins FDP-Wins Age Rng K-BB% Pull% Cent% Oppo% Soft% Med% Hard% kwERA TTO% CH% (pi) O-Swing% (pi) Z-Swing% (pi) Swing% (pi) O-Contact% (pi) Z-Contact% (pi) Contact% (pi) Zone% (pi) Pace (pi) FRM K/9+ BB/9+ K/BB+ H/9+ HR/9+ AVG+ WHIP+ BABIP+ LOB%+ K%+ BB%+ LD%+ GB%+ FB%+ HR/FB%+ Pull%+ Cent%+ Oppo%+ Soft%+ Med%+ Hard%+ playerid
32 2015 David Price - - - 18 5 0 32 32 220.1 9.19 1.92 0.69 0.290 78.60% 40.40% 7.80% 2.45 2.78 3.24 6.7 29 2.45 3 1 0 0 220.1 888 190 70 60 17 47 2 3 4 0 225 243 219 139 24 1066 2322 3388 137 17 12 3 9.19 1.92 4.79 7.76 0.69 0.227 1.08 0.290 78.60% 2.78 1.11 23.10% 40.40% 36.40% 11.00% 7.80% 7.00% 25.00% 58.3 220.1 3.36 3.24 3.70 36.42 3.80 0.98 0.93 0.85 1.24 29 3.41 0.35 53.40% 94.2 NaN NaN 15.90% 89.6 8.00% 79.7 22.70% 85.2 NaN NaN NaN NaN 0.40% NaN 19.7 NaN -0.5 1.5 10.6 NaN NaN 1.09 NaN -0.09 0.57 1.38 NaN NaN 33.90% 68.60% 50.80% 66.70% 81.70% 76.60% 48.80% 67.00% 11.90% 0 0 0 60 67 80 25.30% 5.30% 3.27 5.60 -0.32 32.30% 22.20% 32.90% 66.00% 50.70% 63.50% 82.00% 76.40% 53.90% 26.0 7.4 0.2 0.5 0.7 29 - 29 20.10% 33.30% 36.70% 30.00% 17.00% 54.80% 28.20% 3.08 32.60% 22.70% 33.30% 65.50% 50.70% 62.70% 82.40% 76.50% 54.10% 26.3 -1.9 120 66 181 89 66 91 83 99 108 126 70 112 91 104 68 83 106 120 90 104 100 3184
33 2014 Jon Lester - - - 16 11 0 32 32 219.2 9.01 1.97 0.66 0.299 76.10% 42.40% 7.20% 2.46 2.80 3.10 5.4 30 2.46 1 1 0 0 219.2 885 194 76 60 16 48 0 5 3 0 220 253 221 123 30 1214 2279 3493 110 27 15 6 9.01 1.97 4.58 7.95 0.66 0.233 1.10 0.299 76.10% 2.80 1.14 20.60% 42.40% 37.00% 13.60% 7.20% 10.70% 40.00% 46.4 219.2 3.12 3.10 3.19 29.10 3.10 0.94 0.89 0.88 0.92 31 2.72 0.68 50.20% 91.8 NaN NaN 30.80% 87.8 16.40% 75.1 2.60% 84.8 NaN NaN NaN NaN NaN NaN 6.0 NaN 6.6 7.6 0.0 NaN NaN 0.34 NaN 0.61 1.32 -0.03 NaN NaN 32.20% 65.70% 46.60% 66.10% 86.70% 78.60% 43.10% 61.40% 10.00% 0 0 0 63 75 82 24.90% 5.40% 3.09 4.51 -0.34 41.10% NaN 32.00% 63.60% 46.70% 61.80% 88.20% 78.50% 46.30% 22.6 5.6 0.0 0.3 0.2 30 - 30 19.40% 40.90% 28.80% 30.40% 21.40% 51.60% 27.00% 2.94 32.10% 2.60% 31.30% 63.70% 46.60% 62.00% 87.70% 78.60% 47.30% 22.6 1.5 117 68 173 92 74 94 86 101 105 123 71 99 97 104 77 99 85 121 118 97 94 4930
67 2010 Roy Oswalt - - - 13 13 0 33 32 211.2 8.21 2.34 0.81 0.253 77.80% 45.70% 9.10% 2.76 3.27 3.31 4.4 32 2.76 2 2 0 0 211.2 837 162 70 65 19 55 2 5 2 1 193 259 208 100 17 1037 2157 3194 74 13 17 2 8.21 2.34 3.51 6.89 0.81 0.208 1.03 0.253 77.80% 3.27 1.25 17.60% 45.70% 36.70% 8.20% 9.10% 5.00% 11.80% 39.9 210.2 3.35 3.31 3.69 33.41 3.58 0.92 0.94 0.88 1.05 31 3.25 0.75 55.30% 92.6 14.80% 84.3 NaN NaN 14.70% 72.7 15.10% 83.0 NaN NaN NaN NaN 2.40% NaN 14.6 4.4 NaN 5.8 6.3 NaN NaN 0.85 0.96 NaN 1.27 1.33 NaN NaN 26.70% 65.50% 47.00% 65.00% 84.30% 79.10% 52.30% 65.10% 9.90% 0 0 0 69 82 83 23.10% 6.60% 3.34 3.15 -0.51 30.00% NaN 28.40% 61.50% 46.80% 62.90% 84.60% 78.70% 55.60% 21.3 6.3 2.1 -0.2 1.9 32 - 32 16.50% 33.10% 40.60% 26.40% 18.70% 54.10% 27.20% 3.30 31.90% 15.20% 27.80% 61.80% 46.80% 64.30% 83.90% 78.70% 55.80% 21.7 0.3 111 71 157 78 87 83 76 86 108 120 76 96 102 100 97 86 111 105 102 105 90 571
83 2011 Doug Fister - - - 11 13 0 32 31 216.1 6.07 1.54 0.46 0.272 73.30% 47.50% 5.10% 2.83 3.02 3.61 5.4 27 2.83 3 0 0 0 216.1 875 193 76 68 11 37 2 12 3 1 146 320 216 137 24 1037 2158 3195 78 22 7 1 6.07 1.54 3.95 8.03 0.46 0.234 1.06 0.272 73.30% 3.02 1.48 20.40% 47.50% 32.10% 11.10% 5.10% 6.90% 14.30% 47.5 213.1 3.35 3.61 2.18 23.64 2.50 0.95 0.89 0.89 1.15 29 2.52 -0.22 57.00% 90.0 17.20% 86.6 NaN NaN 15.10% 75.2 10.70% 82.8 NaN NaN NaN NaN 0.40% NaN 23.7 -3.1 NaN 3.5 4.6 NaN NaN 1.30 -0.56 NaN 0.73 1.35 NaN NaN 36.40% 57.10% 45.80% 77.30% 90.90% 85.00% 45.40% 64.60% 6.90% 0 0 0 73 77 89 16.70% 4.20% 3.66 3.24 -0.19 34.40% 25.20% 32.50% 57.40% 45.80% 73.60% 90.70% 85.10% 53.50% 18.5 6.0 1.2 -0.6 0.6 27 - 27 12.50% 38.80% 37.20% 24.00% 25.70% 52.80% 21.50% 3.71 22.20% NaN 31.80% 57.50% 45.80% 72.10% 91.00% 85.10% 54.70% 19.2 -2.2 87 50 173 91 46 92 80 94 102 92 53 105 108 87 52 96 107 97 109 102 87 9425
121 2014 Jeff Samardzija - - - 7 13 0 33 33 219.2 8.28 1.76 0.82 0.283 73.20% 50.20% 10.60% 2.99 3.20 3.07 4.4 29 2.99 2 0 0 0 219.2 879 191 86 73 20 43 3 10 10 0 202 309 188 119 20 1124 2215 3339 99 16 7 0 8.28 1.76 4.70 7.83 0.82 0.231 1.07 0.283 73.20% 3.20 1.64 19.30% 50.20% 30.50% 10.60% 10.60% 5.20% 0.00% 38.9 219.2 3.35 3.07 1.04 8.60 1.00 0.98 0.94 0.87 1.09 31 1.43 -0.37 55.20% 94.4 20.60% 86.0 12.30% 92.5 NaN NaN NaN NaN 11.90% 85.3 NaN NaN 0.30% NaN 9.2 7.9 -3.6 NaN NaN 3.6 NaN 0.50 1.15 -0.87 NaN NaN 0.91 NaN 33.40% 66.50% 48.70% 62.90% 85.60% 77.20% 46.20% 65.50% 11.10% 0 0 0 82 86 82 23.00% 4.90% 3.06 4.06 -0.21 30.90% 24.10% 33.20% 63.30% 48.70% 59.90% 85.90% 77.30% 51.50% 23.0 4.2 0.7 -0.9 -0.2 29 - 29 18.10% 38.00% 37.60% 24.40% 19.90% 55.40% 24.70% 3.10 30.20% NaN 33.50% 62.70% 48.70% 59.80% 85.90% 77.30% 52.30% 23.9 -3.1 107 61 176 91 94 94 84 96 100 113 64 93 112 88 112 95 110 95 109 105 85 3254
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
730 2019 Homer Bailey - - - 12 8 0 27 27 139.2 8.25 3.09 1.29 0.304 70.10% 44.80% 14.50% 4.96 4.36 4.47 2.0 33 4.96 0 0 0 0 139.2 604 143 78 77 20 48 0 4 4 1 128 187 138 92 16 879 1589 2468 108 10 4 1 8.25 3.09 2.67 9.21 1.29 0.259 1.37 0.304 70.10% 4.36 1.36 22.10% 44.80% 33.10% 11.60% 14.50% 5.30% 25.00% 21.2 139.2 4.75 4.47 -0.33 -4.14 -0.44 0.90 0.86 0.87 0.95 27 0.07 -0.44 50.30% 93.0 13.30% 87.3 1.20% 89.1 9.30% 78.4 NaN NaN 26.00% 84.1 NaN NaN 1.20% NaN 2.9 -0.4 -1.7 -2.8 NaN 1.1 NaN 0.24 -0.12 -6.18 -1.22 NaN 0.17 NaN 31.80% 70.50% 48.10% 65.60% 86.80% 78.70% 42.20% 63.60% 10.30% 0 0 0 108 97 97 21.20% 8.00% 4.66 6.96 0.60 47.60% 2.70% 30.10% 66.50% 47.80% 60.20% 87.60% 78.80% 48.80% 24.4 1.6 -0.2 -0.2 -0.5 33 - 33 13.30% 40.60% 33.30% 26.10% 15.00% 40.90% 44.20% 4.66 32.50% NaN 30.20% 65.60% 47.60% 61.60% 87.00% 78.80% 49.10% 24.9 0.1 94 94 100 104 89 103 101 102 98 94 94 104 106 91 95 99 98 105 89 90 117 8362
762 2016 Wade Miley - - - 9 13 0 30 30 166.0 7.43 2.66 1.36 0.328 68.60% 47.30% 16.20% 5.37 4.45 4.04 1.8 29 5.37 1 1 0 0 166.0 711 187 100 99 25 49 1 6 8 2 137 241 154 115 9 960 1772 2732 90 12 9 4 7.43 2.66 2.80 10.14 1.36 0.285 1.42 0.328 68.60% 4.45 1.56 22.50% 47.30% 30.20% 5.80% 16.20% 5.00% 44.40% 17.3 166.0 4.93 4.04 -2.59 -18.46 -1.91 0.88 0.83 0.86 0.96 29 -1.53 -1.40 51.20% 90.3 19.50% 84.1 NaN NaN 11.80% 77.0 17.60% 82.4 NaN NaN NaN NaN 0.10% NaN -10.5 -1.5 NaN -0.6 3.1 NaN NaN -0.75 -0.29 NaN -0.20 0.64 NaN NaN 32.50% 63.70% 46.20% 69.10% 88.90% 81.00% 43.80% 60.20% 8.80% 0 0 0 131 105 95 19.30% 6.90% 4.21 4.88 0.92 38.20% 10.90% 33.80% 61.40% 46.20% 67.30% 90.20% 81.00% 44.90% 17.8 0.4 -1.2 -0.2 -1.4 29 - 29 12.40% 42.20% 37.20% 20.60% 17.30% 49.30% 33.30% 4.26 29.70% 17.50% 33.10% 62.80% 46.10% 67.50% 90.40% 81.20% 43.90% 18.4 -2.2 93 88 106 114 111 112 108 110 94 92 87 111 107 85 125 104 109 82 93 99 106 8779
766 2017 Jeremy Hellickson - - - 8 11 0 30 30 164.0 5.27 2.58 1.92 0.246 66.30% 34.90% 15.10% 5.43 5.77 5.51 0.3 30 5.43 0 0 0 0 164.0 695 160 105 99 35 47 2 8 4 1 96 185 232 113 27 967 1651 2618 89 6 14 1 5.27 2.58 2.04 8.78 1.92 0.250 1.26 0.246 66.30% 5.77 0.80 21.30% 34.90% 43.80% 11.60% 15.10% 3.20% 7.10% 3.0 163.3 6.26 5.51 -2.34 -21.49 -2.17 0.93 0.91 0.88 0.94 30 -1.17 -1.35 46.10% 90.2 NaN NaN 11.80% 86.9 12.60% 76.6 29.50% 81.3 NaN NaN NaN NaN 0.30% NaN 3.9 NaN -2.9 -1.7 -11.2 NaN NaN 0.32 NaN -0.95 -0.50 -1.45 NaN NaN 30.00% 69.90% 47.90% 74.40% 87.00% 82.60% 44.80% 59.10% 8.30% 0 0 0 124 129 127 13.80% 6.80% 5.37 4.88 -0.33 37.90% 8.80% 32.30% 66.90% 47.90% 73.00% 88.10% 82.50% 45.00% 25.9 0.4 2.1 -2.0 0.1 30 - 30 7.10% 45.40% 28.70% 25.90% 19.90% 48.20% 32.00% 5.08 25.60% 30.30% 32.90% 67.10% 47.80% 71.60% 89.10% 82.30% 43.60% 26.9 -1.4 63 78 81 100 153 99 94 82 91 64 79 105 79 124 110 115 82 102 104 98 101 4371
771 2013 Edinson Volquez - - - 9 12 0 33 32 170.1 7.50 4.07 1.00 0.325 64.50% 47.60% 11.90% 5.71 4.24 4.07 0.8 29 5.71 0 0 0 0 170.1 777 193 114 108 19 77 2 3 16 0 142 257 160 123 11 1178 1835 3013 82 24 15 5 7.50 4.07 1.84 10.20 1.00 0.277 1.59 0.325 64.50% 4.24 1.61 22.80% 47.60% 29.60% 6.90% 11.90% 9.30% 33.30% 7.3 169.1 4.73 4.07 -2.83 -38.00 -4.13 0.97 0.86 0.86 0.81 33 -3.41 0.48 54.60% 92.5 NaN NaN NaN NaN 24.70% 79.1 20.70% 83.2 NaN NaN NaN NaN 0.30% NaN -8.2 NaN NaN -10.8 -8.1 NaN NaN -0.50 NaN NaN -1.46 -1.30 NaN NaN 26.60% 62.60% 42.60% 62.50% 88.40% 79.50% 44.50% 56.80% 8.70% 0 0 0 160 115 108 18.30% 9.90% 4.26 4.33 1.47 19.80% NaN 27.50% 59.70% 42.70% 59.60% 89.80% 79.50% 47.00% 25.4 -2.4 -1.4 -1.8 -3.2 29 - 29 8.40% 41.10% 36.40% 22.50% 15.50% 53.30% 31.20% 4.29 30.60% 20.70% 27.00% 59.70% 42.70% 58.70% 89.70% 79.50% 47.80% 25.9 0.9 100 137 73 119 113 112 124 111 88 93 127 107 104 90 117 108 100 88 93 99 105 3990
774 2016 James Shields - - - 6 19 0 33 33 181.2 6.69 4.06 1.98 0.302 72.70% 40.40% 17.80% 5.85 6.01 5.21 -1.0 34 5.85 1 0 0 0 181.2 822 208 122 118 40 82 3 8 10 0 135 238 225 126 15 1277 1856 3133 65 16 6 2 6.69 4.06 1.65 10.30 1.98 0.284 1.60 0.302 72.70% 6.01 1.06 21.40% 40.40% 38.20% 6.70% 17.80% 6.70% 33.30% -11.5 181.2 6.31 5.21 -2.00 -29.77 -3.08 0.96 0.89 0.87 0.88 32 -4.72 2.65 44.00% 90.4 1.50% 80.9 18.60% 86.3 15.60% 77.0 20.40% 83.6 NaN NaN NaN NaN 0.40% NaN -17.1 -1.2 -13.3 -1.6 -6.8 NaN NaN -1.25 -2.70 -2.30 -0.34 -1.07 NaN NaN 28.50% 68.20% 43.50% 63.70% 89.00% 78.70% 37.70% 54.70% 9.20% 0 0 0 142 143 124 16.40% 10.00% 5.12 3.22 -0.16 25.10% 18.10% 29.00% 63.70% 43.60% 60.80% 89.70% 78.60% 42.10% 23.4 -0.7 -0.2 0.5 0.3 34 - 34 6.50% 42.90% 32.80% 24.40% 15.30% 50.90% 33.80% 4.97 31.30% 20.50% 29.00% 64.10% 43.60% 60.50% 90.10% 78.60% 41.50% 24.1 -1.4 83 131 63 117 166 113 121 101 100 78 123 104 91 109 138 107 95 96 82 102 108 7059

67 rows × 187 columns

In [143]:
pw['Season'].unique()
Out[143]:
array([2015, 2018, 2014, 2013, 2016, 2017, 2010, 2011, 2019, 2012])
In [144]:
pw__ = pw[pw['Name'] == 'David Price']
pw__
Out[144]:
Season Name Team W L SV G GS IP K/9 BB/9 HR/9 BABIP LOB% GB% HR/FB ERA FIP xFIP WAR Age ERA.1 CG ShO SV.1 BS IP.1 TBF H R ER HR BB IBB HBP WP BK SO GB FB LD IFFB Balls Strikes Pitches RS IFH BU BUH K/9.1 BB/9.1 K/BB H/9 HR/9.1 AVG WHIP BABIP.1 LOB%.1 FIP.1 GB/FB LD% GB%.1 FB% IFFB% HR/FB.1 IFH% BUH% Starting Start-IP tERA xFIP.1 WPA RE24 REW pLI inLI gmLI exLI Pulls WPA/LI Clutch FB%.1 FBv SL% SLv CT% CTv CB% CBv CH% CHv SF% SFv KN% KNv XX% PO% wFB wSL wCT wCB wCH wSF wKN wFB/C wSL/C wCT/C wCB/C wCH/C wSF/C wKN/C O-Swing% Z-Swing% Swing% O-Contact% Z-Contact% Contact% Zone% F-Strike% SwStr% HLD SD MD ERA- FIP- xFIP- K% BB% SIERA RS/9 E-F FA% (pfx) FT% (pfx) O-Swing% (pfx) Z-Swing% (pfx) Swing% (pfx) O-Contact% (pfx) Z-Contact% (pfx) Contact% (pfx) Zone% (pfx) Pace RA9-WAR BIP-Wins LOB-Wins FDP-Wins Age Rng K-BB% Pull% Cent% Oppo% Soft% Med% Hard% kwERA TTO% CH% (pi) O-Swing% (pi) Z-Swing% (pi) Swing% (pi) O-Contact% (pi) Z-Contact% (pi) Contact% (pi) Zone% (pi) Pace (pi) FRM K/9+ BB/9+ K/BB+ H/9+ HR/9+ AVG+ WHIP+ BABIP+ LOB%+ K%+ BB%+ LD%+ GB%+ FB%+ HR/FB%+ Pull%+ Cent%+ Oppo%+ Soft%+ Med%+ Hard%+ playerid
32 2015 David Price - - - 18 5 0 32 32 220.1 9.19 1.92 0.69 0.290 78.60% 40.40% 7.80% 2.45 2.78 3.24 6.7 29 2.45 3 1 0 0 220.1 888 190 70 60 17 47 2 3 4 0 225 243 219 139 24 1066 2322 3388 137 17 12 3 9.19 1.92 4.79 7.76 0.69 0.227 1.08 0.290 78.60% 2.78 1.11 23.10% 40.40% 36.40% 11.00% 7.80% 7.00% 25.00% 58.3 220.1 3.36 3.24 3.70 36.42 3.80 0.98 0.93 0.85 1.24 29 3.41 0.35 53.40% 94.2 NaN NaN 15.90% 89.6 8.00% 79.7 22.70% 85.2 NaN NaN NaN NaN 0.40% NaN 19.7 NaN -0.5 1.5 10.6 NaN NaN 1.09 NaN -0.09 0.57 1.38 NaN NaN 33.90% 68.60% 50.80% 66.70% 81.70% 76.60% 48.80% 67.00% 11.90% 0 0 0 60 67 80 25.30% 5.30% 3.27 5.60 -0.32 32.30% 22.20% 32.90% 66.00% 50.70% 63.50% 82.00% 76.40% 53.90% 26.0 7.4 0.2 0.5 0.7 29 - 29 20.10% 33.30% 36.70% 30.00% 17.00% 54.80% 28.20% 3.08 32.60% 22.70% 33.30% 65.50% 50.70% 62.70% 82.40% 76.50% 54.10% 26.3 -1.9 120 66 181 89 66 91 83 99 108 126 70 112 91 104 68 83 106 120 90 104 100 3184
43 2012 David Price Rays 20 5 0 31 31 211.0 8.74 2.52 0.68 0.285 81.10% 53.10% 10.50% 2.56 3.05 3.12 4.3 26 2.56 2 1 0 0 211.0 836 173 63 60 16 59 2 5 8 1 205 299 152 112 14 1165 2167 3332 126 20 4 0 8.74 2.52 3.47 7.38 0.68 0.224 1.10 0.285 81.10% 3.05 1.97 19.90% 53.10% 27.00% 9.20% 10.50% 6.70% 0.00% 40.4 211.0 3.21 3.12 3.42 37.04 4.00 0.98 0.92 0.87 1.19 29 3.36 0.14 60.70% 95.5 NaN NaN 15.70% 89.3 11.30% 79.0 12.30% 84.4 NaN NaN NaN NaN 0.20% NaN 18.9 NaN 5.2 6.9 4.4 NaN NaN 0.94 NaN 1.00 1.84 1.07 NaN NaN 26.90% 63.70% 43.90% 68.10% 87.10% 80.80% 46.10% 62.70% 8.40% 0 0 0 66 75 75 24.50% 7.10% 3.23 5.37 -0.49 25.30% 35.90% 27.20% 58.70% 43.90% 65.70% 86.60% 80.50% 52.90% 22.1 6.2 0.5 1.3 1.8 26 - 26 17.50% 35.30% 40.20% 24.50% 15.30% 59.10% 25.60% 3.33 33.50% 12.10% 27.80% 58.10% 43.70% 64.80% 86.60% 80.00% 52.60% 22.6 6.3 118 83 142 84 63 89 84 98 112 126 89 96 119 78 90 88 115 99 99 106 89 3184
60 2010 David Price Rays 19 6 0 32 31 208.2 8.11 3.41 0.65 0.270 78.50% 43.70% 6.50% 2.72 3.42 3.83 4.3 24 2.72 2 1 0 0 208.2 861 170 71 63 15 79 1 5 5 3 188 254 230 97 22 1160 2195 3355 139 14 8 1 8.11 3.41 2.38 7.33 0.65 0.219 1.19 0.270 78.50% 3.42 1.10 16.70% 43.70% 39.60% 9.60% 6.50% 5.50% 12.50% 40.0 207.2 3.20 3.83 2.18 26.76 2.94 0.99 0.92 0.86 1.13 30 1.58 0.63 74.00% 94.6 4.90% 86.5 NaN NaN 15.60% 77.5 5.50% 84.2 NaN NaN NaN NaN 1.10% NaN 19.7 1.2 NaN 1.5 3.4 NaN NaN 0.80 0.74 NaN 0.29 1.88 NaN NaN 30.20% 66.90% 47.80% 68.50% 84.60% 79.30% 47.80% 62.60% 9.90% 0 0 0 69 84 92 21.80% 9.20% 3.83 6.00 -0.70 56.90% 17.50% 29.00% 62.40% 47.60% 66.10% 83.90% 79.10% 55.50% 21.3 5.9 1.1 0.4 1.6 24 - 24 12.70% 30.10% 39.10% 30.90% 19.90% 54.70% 25.50% 3.76 32.80% 5.00% 28.50% 63.00% 47.60% 68.10% 83.10% 79.10% 55.20% 21.9 -1.2 119 105 112 83 66 86 89 93 109 123 109 93 100 103 70 75 111 124 111 105 85 3184
199 2014 David Price - - - 15 12 0 34 34 248.1 9.82 1.38 0.91 0.306 72.70% 41.20% 9.70% 3.26 2.78 2.76 5.6 28 3.26 3 0 0 0 248.1 1009 230 100 90 25 38 1 5 2 0 271 280 259 140 22 1130 2600 3730 111 21 16 5 9.82 1.38 7.13 8.34 0.91 0.238 1.08 0.306 72.70% 2.78 1.08 20.60% 41.20% 38.10% 8.50% 9.70% 7.50% 31.30% 48.3 247.4 3.03 2.76 2.01 11.68 1.24 0.92 0.88 0.87 1.17 31 2.04 0.16 57.00% 93.2 NaN NaN 14.40% 86.9 8.90% 79.4 19.70% 84.9 NaN NaN NaN NaN 0.10% NaN 6.3 NaN 7.7 1.9 3.8 NaN NaN 0.30 NaN 1.44 0.58 0.51 NaN NaN 32.10% 66.60% 48.80% 67.00% 84.20% 78.30% 48.40% 69.90% 10.60% 0 0 0 87 75 73 26.90% 3.80% 2.72 4.02 0.48 17.20% 39.60% 31.30% 62.60% 48.80% 65.10% 83.50% 78.30% 56.00% 26.6 4.0 -0.5 -1.1 -1.6 28 - 28 23.10% 36.40% 35.10% 28.50% 16.10% 55.50% 28.40% 2.50 33.10% 19.80% 31.90% 62.00% 48.80% 65.30% 83.60% 78.30% 56.10% 27.2 3.8 128 48 269 96 102 96 84 103 100 133 49 99 94 107 103 89 104 113 89 104 99 3184
219 2013 David Price Rays 10 8 0 27 27 186.2 7.28 1.30 0.77 0.298 70.00% 44.90% 8.60% 3.33 3.03 3.27 4.0 27 3.33 4 0 0 0 186.2 740 178 78 69 16 27 0 3 6 0 151 249 185 120 23 845 1862 2707 86 10 5 2 7.28 1.30 5.59 8.58 0.77 0.251 1.10 0.298 70.00% 3.03 1.35 21.70% 44.90% 33.40% 12.40% 8.60% 4.00% 40.00% 35.5 186.2 3.42 3.27 1.62 14.99 1.60 1.03 0.98 0.88 1.47 23 2.48 -0.90 54.00% 93.5 NaN NaN 17.80% 88.3 11.40% 79.5 16.80% 83.9 NaN NaN NaN NaN 0.00% NaN 14.2 NaN 0.0 -6.4 8.9 NaN NaN 0.97 NaN -0.01 -2.09 1.95 NaN NaN 31.10% 65.30% 47.20% 73.30% 88.30% 83.10% 47.00% 67.70% 8.00% 0 0 0 86 78 82 20.40% 3.70% 3.36 4.15 0.30 19.60% 33.70% 29.90% 61.90% 47.20% 68.90% 88.80% 83.00% 54.00% 25.8 2.8 0.0 -1.2 -1.2 27 - 27 16.80% 34.70% 40.30% 25.00% 17.20% 54.20% 28.60% 3.29 26.20% 16.80% 30.40% 61.70% 47.20% 69.00% 89.00% 83.00% 53.70% 26.4 4.0 95 43 224 98 74 99 83 101 96 102 46 103 104 94 80 88 113 101 106 101 95 3184
293 2011 David Price Rays 12 13 0 34 34 224.1 8.75 2.53 0.88 0.281 73.30% 44.30% 9.70% 3.49 3.32 3.32 4.8 25 3.49 0 0 0 0 224.1 918 192 93 87 22 63 5 9 2 0 218 273 227 116 21 1341 2355 3696 98 23 12 2 8.75 2.53 3.46 7.70 0.88 0.227 1.14 0.281 73.30% 3.32 1.20 18.80% 44.30% 36.90% 9.30% 9.70% 8.40% 16.70% 44.1 224.1 3.38 3.32 -0.14 11.49 1.14 0.95 0.86 0.88 0.86 34 1.86 -2.00 70.50% 94.8 9.60% 89.2 NaN NaN 9.10% 78.4 10.80% 83.1 NaN NaN NaN NaN 0.50% NaN 13.6 1.2 NaN -5.7 10.4 NaN NaN 0.53 0.35 NaN -1.72 2.61 NaN NaN 30.60% 61.10% 44.20% 73.10% 85.70% 80.90% 44.70% 60.00% 8.40% 0 0 0 90 84 82 23.80% 6.90% 3.26 3.93 0.17 36.80% 34.30% 27.30% 59.00% 44.10% 69.90% 85.20% 80.80% 53.30% 20.6 4.6 0.7 -0.9 -0.3 25 - 25 16.90% 34.40% 36.60% 29.00% 22.90% 52.40% 24.70% 3.18 33.00% 11.00% 27.70% 58.90% 44.20% 70.80% 85.00% 80.80% 52.70% 21.0 -3.9 126 82 154 87 89 90 86 96 102 131 85 97 101 100 99 85 105 117 97 101 101 3184
332 2018 David Price Red Sox 16 7 0 30 30 176.0 9.05 2.56 1.28 0.274 77.30% 40.10% 13.20% 3.58 4.02 3.95 2.4 32 3.58 1 0 0 0 176.0 722 151 75 70 25 50 0 10 1 0 177 193 189 99 25 940 1776 2716 102 11 4 1 9.05 2.56 3.54 7.72 1.28 0.228 1.14 0.274 77.30% 4.02 1.02 20.60% 40.10% 39.30% 13.20% 13.20% 5.70% 25.00% 23.6 176.0 4.41 3.95 2.87 23.81 2.36 0.97 0.93 0.88 1.30 29 2.75 0.20 46.80% 92.7 NaN NaN 28.10% 88.7 2.90% 79.1 22.20% 85.3 NaN NaN NaN NaN NaN NaN 16.3 NaN 0.2 -5.7 -1.5 NaN NaN 1.28 NaN 0.03 -7.23 -0.24 NaN NaN 30.90% 67.90% 47.30% 68.50% 86.20% 79.80% 44.30% 62.30% 9.60% 0 0 0 80 96 93 24.50% 6.90% 3.82 5.22 -0.44 12.70% 34.40% 29.40% 64.20% 47.30% 63.00% 87.00% 79.80% 51.30% 26.7 3.7 0.7 0.6 1.3 32 - 32 17.60% 40.40% 35.30% 24.30% 19.40% 48.50% 32.20% 3.69 34.90% 22.20% 29.20% 64.30% 47.30% 63.00% 86.90% 79.80% 51.50% 26.9 3.1 107 80 133 90 105 92 87 93 107 111 84 97 95 108 103 99 103 97 109 104 91 3184
494 2016 David Price Red Sox 17 9 0 35 35 230.0 8.92 1.96 1.17 0.310 73.60% 43.70% 13.50% 3.99 3.60 3.52 4.4 30 3.99 2 0 0 0 230.0 951 227 106 102 30 50 1 7 4 0 228 286 222 146 23 1158 2437 3595 169 14 12 3 8.92 1.96 4.56 8.88 1.17 0.254 1.20 0.310 73.60% 3.60 1.29 22.30% 43.70% 33.90% 10.40% 13.50% 4.90% 25.00% 41.9 230.0 4.27 3.52 0.53 13.19 1.36 0.94 0.90 0.87 0.99 33 1.71 -1.15 48.80% 92.9 NaN NaN 19.70% 88.3 8.60% 79.0 22.90% 84.5 NaN NaN NaN NaN 0.10% NaN -1.9 NaN -2.2 -2.9 15.6 NaN NaN -0.11 NaN -0.32 -0.93 1.90 NaN NaN 32.90% 70.60% 50.70% 66.70% 81.60% 76.50% 47.20% 64.90% 11.90% 0 0 0 91 86 83 24.00% 5.30% 3.60 6.61 0.39 12.40% 37.20% 32.40% 67.80% 50.70% 64.60% 81.80% 76.50% 51.80% 25.6 4.3 -0.6 0.6 -0.1 30 - 30 18.70% 44.10% 31.70% 24.20% 18.60% 46.60% 34.80% 3.50 32.40% 23.00% 32.40% 67.00% 50.50% 65.90% 80.90% 76.30% 52.10% 26.0 0.7 111 65 173 100 96 100 91 104 100 115 66 110 99 96 104 108 93 97 100 93 111 3184
In [145]:
print(pw.dtypes)
Season       int64
Name        object
Team        object
W            int64
L            int64
             ...  
Oppo%+       int64
Soft%+       int64
Med%+        int64
Hard%+       int64
playerid     int64
Length: 187, dtype: object
In [146]:
pw.describe()
Out[146]:
Season W L SV G GS IP K/9 BB/9 HR/9 BABIP ERA FIP xFIP WAR Age ERA.1 CG ShO SV.1 BS IP.1 TBF H R ER HR BB IBB HBP WP BK SO GB FB LD IFFB Balls Strikes Pitches RS IFH BU BUH K/9.1 BB/9.1 K/BB H/9 HR/9.1 AVG WHIP BABIP.1 FIP.1 GB/FB Starting Start-IP tERA xFIP.1 WPA RE24 REW pLI inLI gmLI exLI Pulls WPA/LI Clutch FBv SLv CTv CBv CHv SFv KNv PO% wFB wSL wCT wCB wCH wSF wKN wFB/C wSL/C wCT/C wCB/C wCH/C wSF/C wKN/C HLD SD MD ERA- FIP- xFIP- SIERA RS/9 E-F Pace RA9-WAR BIP-Wins LOB-Wins FDP-Wins kwERA Pace (pi) FRM K/9+ BB/9+ K/BB+ H/9+ HR/9+ AVG+ WHIP+ BABIP+ LOB%+ K%+ BB%+ LD%+ GB%+ FB%+ HR/FB%+ Pull%+ Cent%+ Oppo%+ Soft%+ Med%+ Hard%+ playerid
count 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.00000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 569.000000 303.000000 670.000000 726.000000 79.000000 9.000000 0.0 776.000000 570.000000 305.000000 670.000000 726.000000 79.000000 9.000000 776.000000 570.000000 305.000000 670.000000 726.000000 79.000000 9.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000 776.000000
mean 2014.069588 12.231959 9.994845 0.010309 31.189433 30.817010 190.023196 7.728338 2.680863 1.001392 0.289720 3.750219 3.837887 3.881843 2.915335 28.639175 3.750219 1.015464 0.461340 0.010309 0.005155 190.023196 794.497423 179.601804 85.289948 78.564433 20.930412 56.198454 2.395619 6.475515 5.902062 0.564433 163.538660 251.019330 192.538660 113.176546 18.778351 1083.335052 1948.479381 3031.814433 100.145619 15.585052 11.369845 2.801546 7.728338 2.680863 3.126662 8.524175 1.001392 0.245418 1.245103 0.289720 3.837887 1.376353 27.113660 189.35567 4.174021 3.881843 0.718943 6.973673 0.735644 0.976289 0.907500 0.873621 1.085966 30.041237 0.657809 0.068505 91.590464 83.934446 87.625083 76.952537 83.813912 84.812658 76.622222 NaN 1.381314 4.101579 0.731475 0.729104 1.073416 0.921519 0.711111 0.052796 0.389386 -0.552230 -0.189687 -0.312906 -0.121266 0.542222 0.028351 0.081186 0.076031 92.780928 94.876289 95.766753 3.937887 4.753402 -0.087822 21.941366 3.098711 0.258247 -0.073840 0.182216 3.900992 22.563144 0.162371 99.198454 86.291237 124.470361 97.791237 95.220361 98.126289 94.744845 98.342784 101.630155 101.301546 87.447165 99.533505 101.246134 98.677835 97.478093 100.836340 99.591495 99.199742 100.347938 100.429124 99.121134 6432.974227
std 2.834407 3.368976 2.903164 0.101075 2.156207 2.217239 20.629595 1.659431 0.726437 0.306698 0.022667 0.753442 0.644552 0.569650 1.574829 3.919014 0.753442 1.392058 0.766088 0.101075 0.071657 20.629595 78.692214 25.137813 15.626353 14.522986 5.995508 14.687780 2.076456 3.319451 3.730010 0.836875 40.241478 49.628227 41.249936 17.406443 7.743624 129.768354 205.274016 304.195831 20.049762 5.231466 5.261283 1.952810 1.659431 0.726437 1.236379 1.037019 0.306698 0.023333 0.140061 0.022667 0.644552 0.452573 13.800929 21.22388 0.715253 0.569650 1.736044 16.947970 1.775658 0.053184 0.046572 0.020826 0.172534 2.312011 1.701212 0.758467 2.491925 2.837899 2.249716 3.352426 2.994420 3.081889 1.204967 NaN 11.515071 7.467832 6.105126 5.678155 6.184708 5.111467 12.324411 0.703151 2.188329 4.060364 1.683399 2.399016 2.184568 1.663804 0.201210 0.340560 0.341749 18.373640 15.510085 13.418177 0.565670 0.895687 0.513446 1.997024 2.012817 1.029267 0.887969 1.240559 0.597009 2.004088 2.224057 19.667541 23.341564 47.910768 11.790766 26.819476 9.248932 10.647276 7.696067 6.002672 21.756999 22.161455 9.256428 13.974077 17.181816 22.191647 8.404624 6.173147 11.762490 13.097848 5.067047 10.017057 4492.928490
min 2010.000000 4.000000 2.000000 0.000000 23.000000 23.000000 139.200000 3.670000 0.690000 0.270000 0.202000 1.660000 1.810000 2.080000 -1.600000 20.000000 1.660000 0.000000 0.000000 0.000000 0.000000 139.200000 576.000000 109.000000 41.000000 38.000000 5.000000 16.000000 0.000000 0.000000 0.000000 0.000000 73.000000 113.000000 90.000000 63.000000 2.000000 700.000000 1453.000000 2171.000000 47.000000 1.000000 0.000000 0.000000 3.670000 0.690000 1.180000 5.320000 0.270000 0.165000 0.770000 0.202000 1.810000 0.380000 -16.300000 133.10000 1.840000 2.080000 -4.070000 -38.000000 -4.130000 0.800000 0.760000 0.750000 0.480000 20.000000 -4.720000 -2.180000 81.500000 74.300000 79.400000 62.100000 70.800000 72.000000 75.600000 NaN -34.000000 -18.600000 -14.100000 -15.200000 -15.200000 -9.800000 -15.600000 -2.060000 -28.740000 -53.700000 -13.060000 -37.300000 -5.810000 -0.680000 0.000000 0.000000 0.000000 44.000000 49.000000 54.000000 2.090000 2.540000 -1.520000 15.900000 -2.400000 -2.500000 -2.700000 -3.200000 1.940000 16.200000 -7.300000 48.000000 24.000000 48.000000 60.000000 30.000000 65.000000 57.000000 68.000000 85.000000 49.000000 25.000000 72.000000 51.000000 54.000000 38.000000 75.000000 82.000000 64.000000 55.000000 83.000000 69.000000 106.000000
25% 2012.000000 10.000000 8.000000 0.000000 30.000000 29.000000 175.000000 6.607500 2.150000 0.790000 0.274000 3.240000 3.410000 3.520000 1.800000 26.000000 3.240000 0.000000 0.000000 0.000000 0.000000 175.000000 737.000000 163.000000 74.000000 69.000000 17.000000 46.000000 1.000000 4.000000 3.000000 0.000000 135.000000 216.750000 162.750000 101.000000 13.000000 991.000000 1811.000000 2808.500000 85.000000 12.000000 7.000000 1.000000 6.607500 2.150000 2.290000 7.857500 0.790000 0.231000 1.150000 0.274000 3.410000 1.057500 17.600000 174.00000 3.710000 3.520000 -0.452500 -4.265000 -0.432500 0.940000 0.880000 0.870000 0.970000 29.000000 -0.482500 -0.410000 90.300000 82.000000 86.250000 74.800000 82.300000 83.300000 76.000000 NaN -6.550000 -1.000000 -2.700000 -2.700000 -2.575000 -1.700000 -7.400000 -0.410000 -0.297500 -1.060000 -0.947500 -1.210000 -0.945000 -0.310000 0.000000 0.000000 0.000000 80.000000 85.000000 87.000000 3.590000 4.110000 -0.440000 20.500000 1.700000 -0.500000 -0.700000 -0.700000 3.520000 21.100000 -1.400000 86.000000 69.750000 92.000000 90.000000 76.000000 93.000000 88.000000 93.000000 98.000000 86.000000 71.000000 94.000000 93.000000 87.000000 82.000000 96.000000 96.000000 91.000000 91.000000 97.000000 92.000000 2717.000000
50% 2014.000000 12.000000 10.000000 0.000000 32.000000 31.000000 189.100000 7.615000 2.665000 0.980000 0.290000 3.710000 3.850000 3.880000 2.700000 28.000000 3.710000 1.000000 0.000000 0.000000 0.000000 189.100000 798.000000 180.000000 85.000000 78.000000 21.000000 55.000000 2.000000 6.000000 5.000000 0.000000 161.000000 250.000000 189.500000 113.000000 18.000000 1086.000000 1942.000000 3039.000000 99.000000 15.000000 11.000000 3.000000 7.615000 2.665000 2.890000 8.460000 0.980000 0.245000 1.250000 0.290000 3.850000 1.330000 25.500000 188.20000 4.225000 3.880000 0.630000 6.075000 0.645000 0.980000 0.910000 0.870000 1.070000 30.000000 0.620000 0.060000 91.700000 84.100000 87.800000 77.200000 84.000000 85.000000 76.100000 NaN 1.100000 2.800000 -0.300000 -0.100000 -0.100000 -0.100000 -1.100000 0.070000 0.615000 -0.100000 -0.035000 -0.035000 -0.180000 -0.040000 0.000000 0.000000 0.000000 92.000000 95.000000 97.000000 3.940000 4.675000 -0.120000 21.900000 2.900000 0.200000 -0.100000 0.200000 3.935000 22.500000 0.100000 98.000000 85.000000 115.000000 98.000000 93.000000 98.000000 95.000000 99.000000 102.000000 99.500000 86.000000 100.000000 101.000000 98.000000 97.000000 101.000000 100.000000 99.000000 99.000000 100.000000 99.000000 5447.500000
75% 2016.000000 14.000000 12.000000 0.000000 33.000000 32.000000 203.125000 8.800000 3.162500 1.200000 0.305250 4.265000 4.260000 4.222500 3.800000 31.000000 4.265000 1.000000 1.000000 0.000000 0.000000 203.125000 850.000000 196.000000 96.000000 88.000000 25.000000 66.000000 4.000000 8.000000 8.000000 1.000000 188.000000 283.000000 220.000000 125.000000 23.000000 1176.000000 2088.000000 3243.250000 113.000000 19.000000 15.000000 4.000000 8.800000 3.162500 3.642500 9.250000 1.200000 0.262000 1.350000 0.305250 4.260000 1.600000 35.425000 203.02500 4.632500 4.222500 1.760000 17.415000 1.820000 1.010000 0.940000 0.880000 1.180000 32.000000 1.722500 0.580000 93.100000 85.900000 89.000000 79.200000 85.700000 86.350000 77.100000 NaN 8.700000 9.075000 3.300000 3.000000 3.400000 3.050000 7.900000 0.522500 1.335000 0.730000 0.717500 0.940000 0.915000 0.500000 0.000000 0.000000 0.000000 104.250000 105.000000 105.000000 4.310000 5.310000 0.270000 23.200000 4.300000 1.000000 0.500000 1.000000 4.300000 23.900000 1.500000 114.000000 101.000000 146.000000 107.000000 112.250000 105.000000 102.000000 104.000000 106.000000 116.000000 101.000000 105.000000 111.000000 109.000000 112.000000 106.000000 103.000000 107.250000 109.000000 104.000000 106.000000 9492.000000
max 2019.000000 24.000000 19.000000 1.000000 42.000000 35.000000 251.000000 13.580000 5.220000 2.150000 0.352000 6.130000 6.010000 5.640000 9.000000 43.000000 6.130000 11.000000 6.000000 1.000000 1.000000 251.000000 1009.000000 262.000000 128.000000 118.000000 46.000000 105.000000 13.000000 19.000000 25.000000 5.000000 308.000000 439.000000 319.000000 174.000000 50.000000 1424.000000 2600.000000 3941.000000 189.000000 35.000000 29.000000 11.000000 13.580000 5.220000 11.630000 11.530000 2.150000 0.308000 1.670000 0.352000 6.010000 3.580000 75.700000 251.00000 6.450000 5.640000 6.760000 56.240000 6.240000 1.150000 1.060000 1.100000 1.730000 36.000000 6.370000 2.650000 98.000000 92.500000 94.300000 85.400000 92.300000 93.000000 79.500000 NaN 37.800000 30.700000 30.500000 38.200000 29.300000 17.800000 25.200000 1.950000 21.090000 14.460000 4.730000 9.760000 11.480000 4.790000 3.000000 3.000000 4.000000 160.000000 145.000000 137.000000 5.520000 8.130000 1.470000 28.800000 9.600000 4.000000 3.100000 4.700000 5.660000 29.300000 7.500000 156.000000 173.000000 487.000000 132.000000 231.000000 123.000000 128.000000 118.000000 125.000000 171.000000 163.000000 133.000000 145.000000 165.000000 174.000000 126.000000 119.000000 132.000000 142.000000 121.000000 137.000000 20633.000000
In [147]:
# Create a copy of the dataset
pw_cpy = pw.copy()

Find a list of uncorrelated variables

In [148]:
# Create correlation matrix to look at highly correlated continuous variables
pw_float_vars = pw_cpy.drop(['Name', 'Team', 'Season'], axis=1)
corr_matrix = pw_float_vars.corr().abs()
In [149]:
# Create correlation matrix to look at highly correlated continuous variables
pw_float_vars = pw_cpy[['H', 'BB', 'HBP', 'K/9', 'GB', 'FB', 'FIP', 'xFIP']]
corr_matrix = pw_float_vars.corr().abs()
In [150]:
# Create correlation matrix to look at highly correlated continuous variables
pw_float_vars = pw_cpy[['BB', 'HBP', 'GB', 'FB', 'BABIP', 'Balls', 'Strikes', 'Pitches', 'SO']]
corr_matrix = pw_float_vars.corr().abs()
In [151]:
# Create correlation matrix to look at highly correlated continuous variables
pw_float_vars = pw_cpy[['BB', 'HBP', 'GB', 'FB', 'BABIP', 'SO', 'WHIP', 'RE24']]
corr_matrix = pw_float_vars.corr().abs()
In [152]:
# Create correlation matrix to look at highly correlated continuous variables
pw_float_vars = pw_cpy[['BB', 'HBP', 'GB', 'FB', 'RE24']]
corr_matrix = pw_float_vars.corr().abs()
In [153]:
# Create correlation matrix to look at highly correlated continuous variables
pw_float_vars = pw_cpy[['HBP', 'GB', 'FB', 'RE24', 'tERA']]
corr_matrix = pw_float_vars.corr().abs()
In [154]:
# Create correlation matrix to look at highly correlated continuous variables
pw_float_vars = pw_cpy[['HBP', 'GB', 'FB', 'tERA']]
corr_matrix = pw_float_vars.corr().abs()
In [155]:
# Create correlation matrix to look at highly correlated continuous variables
pw_float_vars = pw_cpy[['FIP', 'xFIP', 'tERA']]
corr_matrix = pw_float_vars.corr().abs()
In [156]:
# Create correlation matrix to look at highly correlated continuous variables
pw_float_vars = pw_cpy[['HBP', 'GB', 'FB', 'tERA', 'IFFB']]
corr_matrix = pw_float_vars.corr().abs()
In [157]:
# Create correlation matrix to look at highly correlated continuous variables
pw_float_vars = pw_cpy[['HBP', 'GB', 'FB', 'SIERA', 'HR', 'ER']]
corr_matrix = pw_float_vars.corr().abs()
In [158]:
# Create correlation matrix to look at highly correlated continuous variables
pw_float_vars = pw_cpy[['HBP', 'GB', 'FB', 'SIERA', 'R', 'WAR']]
corr_matrix = pw_float_vars.corr().abs()
In [159]:
# Create correlation matrix to look at highly correlated continuous variables
pw_float_vars = pw_cpy[['HBP', 'GB', 'FB', 'SIERA', 'R']]
corr_matrix = pw_float_vars.corr().abs()
In [160]:
# Create correlation matrix to look at highly correlated continuous variables
pw_float_vars = pw_cpy[['HBP', 'GB', 'FB', 'SIERA', 'IP', 'GS']]
corr_matrix = pw_float_vars.corr().abs()
In [161]:
# Create correlation matrix to look at highly correlated continuous variables
pw_float_vars = pw_cpy[['HBP', 'GB', 'FB', 'SIERA', 'WP', 'BK', 'SO']]
corr_matrix = pw_float_vars.corr().abs()
In [162]:
# Create correlation matrix to look at highly correlated continuous variables
pw_float_vars = pw_cpy[['HBP', 'GB', 'FB', 'WP', 'BK', 'SO', 'IBB', 'LD', 
                        'RS', 'IFH', 'BU', 'BUH', 'Starting', 'AVG']]
corr_matrix = pw_float_vars.corr().abs()
In [163]:
# Generate a mask for the upper triangle
mask = np.zeros_like(corr_matrix, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True
In [164]:
# Select the upper part of the triangle
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
In [165]:
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr_matrix, mask=mask, cmap=cmap, vmax=1, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})
Out[165]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a3c98cad0>
In [166]:
correlation = pw_float_vars.corr(method ='pearson') 
correlation
Out[166]:
HBP GB FB WP BK SO IBB LD RS IFH BU BUH Starting AVG
HBP 1.000000 0.065048 -0.031934 0.104853 0.031921 0.068160 -0.024520 -0.016395 -0.024947 0.040653 -0.056038 -0.038969 -0.089690 -0.047413
GB 0.065048 1.000000 -0.171409 0.120034 0.011947 -0.183057 0.124186 0.403454 0.146530 0.565708 0.271915 0.165426 0.012938 0.315559
FB -0.031934 -0.171409 1.000000 -0.205924 0.111239 -0.022665 -0.001045 0.317670 0.147695 -0.146227 0.183330 0.161607 -0.082208 0.046922
WP 0.104853 0.120034 -0.205924 1.000000 -0.001283 0.171540 -0.078123 -0.089304 0.034526 0.092672 -0.105062 -0.014540 0.037885 -0.050618
BK 0.031921 0.011947 0.111239 -0.001283 1.000000 0.010462 0.053254 -0.025894 -0.021515 0.019083 0.039564 0.008624 0.017039 -0.025168
SO 0.068160 -0.183057 -0.022665 0.171540 0.010462 1.000000 -0.129563 -0.137023 0.304258 -0.051789 -0.095430 -0.144691 0.740110 -0.636546
IBB -0.024520 0.124186 -0.001045 -0.078123 0.053254 -0.129563 1.000000 0.061968 -0.148479 0.045421 0.312099 0.025751 -0.187598 0.100477
LD -0.016395 0.403454 0.317670 -0.089304 -0.025894 -0.137023 0.061968 1.000000 0.119155 0.190157 0.285318 0.186733 -0.028843 0.448046
RS -0.024947 0.146530 0.147695 0.034526 -0.021515 0.304258 -0.148479 0.119155 1.000000 0.071324 -0.081352 -0.016629 0.308069 -0.178547
IFH 0.040653 0.565708 -0.146227 0.092672 0.019083 -0.051789 0.045421 0.190157 0.071324 1.000000 0.233042 0.157639 0.028670 0.242969
BU -0.056038 0.271915 0.183330 -0.105062 0.039564 -0.095430 0.312099 0.285318 -0.081352 0.233042 1.000000 0.528844 -0.070075 0.112886
BUH -0.038969 0.165426 0.161607 -0.014540 0.008624 -0.144691 0.025751 0.186733 -0.016629 0.157639 0.528844 1.000000 -0.102491 0.220152
Starting -0.089690 0.012938 -0.082208 0.037885 0.017039 0.740110 -0.187598 -0.028843 0.308069 0.028670 -0.070075 -0.102491 1.000000 -0.542254
AVG -0.047413 0.315559 0.046922 -0.050618 -0.025168 -0.636546 0.100477 0.448046 -0.178547 0.242969 0.112886 0.220152 -0.542254 1.000000

Prepare dataset for analysis

Remove columns with NaN values and use only numeric variables.

In [167]:
pw_cpy = pw.copy()

Drop columns with any NaN

In [168]:
# Drop columns with any NaN
null_cols = pw_cpy.columns[pw_cpy.isnull().any()]
pw_cpy.drop(null_cols, axis = 1, inplace = True)

pw_cpy.head()
Out[168]:
Season Name Team W L SV G GS IP K/9 BB/9 HR/9 BABIP LOB% GB% HR/FB ERA FIP xFIP WAR Age ERA.1 CG ShO SV.1 BS IP.1 TBF H R ER HR BB IBB HBP WP BK SO GB FB LD IFFB Balls Strikes Pitches RS IFH BU BUH K/9.1 BB/9.1 K/BB H/9 HR/9.1 AVG WHIP BABIP.1 LOB%.1 FIP.1 GB/FB LD% GB%.1 FB% IFFB% HR/FB.1 IFH% BUH% Starting Start-IP tERA xFIP.1 WPA RE24 REW pLI inLI gmLI exLI Pulls WPA/LI Clutch FB%.1 FBv wFB wFB/C O-Swing% Z-Swing% Swing% O-Contact% Z-Contact% Contact% Zone% F-Strike% SwStr% HLD SD MD ERA- FIP- xFIP- K% BB% SIERA RS/9 E-F O-Swing% (pfx) Z-Swing% (pfx) Swing% (pfx) O-Contact% (pfx) Z-Contact% (pfx) Contact% (pfx) Zone% (pfx) Pace RA9-WAR BIP-Wins LOB-Wins FDP-Wins Age Rng K-BB% Pull% Cent% Oppo% Soft% Med% Hard% kwERA TTO% O-Swing% (pi) Z-Swing% (pi) Swing% (pi) O-Contact% (pi) Z-Contact% (pi) Contact% (pi) Zone% (pi) Pace (pi) FRM K/9+ BB/9+ K/BB+ H/9+ HR/9+ AVG+ WHIP+ BABIP+ LOB%+ K%+ BB%+ LD%+ GB%+ FB%+ HR/FB%+ Pull%+ Cent%+ Oppo%+ Soft%+ Med%+ Hard%+ playerid
0 2015 Zack Greinke Dodgers 19 3 0 32 32 222.2 8.08 1.62 0.57 0.229 86.50% 48.00% 7.30% 1.66 2.76 3.22 5.3 31 1.66 1 0 0 0 222.2 843 148 43 41 14 40 1 5 7 0 200 281 193 112 18 1161 2078 3239 101 12 12 0 8.08 1.62 5.00 5.98 0.57 0.185 0.84 0.229 86.50% 2.76 1.46 19.10% 48.00% 32.90% 9.30% 7.30% 4.30% 0.00% 46.9 222.2 2.81 3.22 6.76 56.24 6.24 1.06 1.03 0.87 1.22 31 5.47 0.90 50.70% 91.8 20.0 1.22 33.30% 69.10% 47.50% 60.60% 85.10% 74.80% 39.90% 64.10% 12.00% 0 0 0 44 73 83 23.70% 4.70% 3.27 4.08 -1.10 33.30% 64.90% 47.50% 57.80% 85.50% 74.80% 45.00% 22.9 9.4 4.0 0.1 4.1 31 - 31 19.00% 38.60% 36.10% 25.30% 21.70% 51.50% 26.80% 3.21 30.10% 33.60% 65.20% 47.50% 58.00% 85.90% 74.90% 44.10% 23.4 3.9 103 55 187 68 58 74 65 77 118 115 61 91 104 101 64 102 101 96 117 99 91 1943
1 2018 Jacob deGrom Mets 10 9 0 32 32 217.0 11.16 1.91 0.41 0.281 82.00% 46.40% 6.30% 1.70 1.99 2.60 9.0 30 1.70 1 0 0 0 217.0 835 152 48 41 10 46 3 5 2 0 269 235 160 111 26 999 2213 3212 86 16 9 2 11.16 1.91 5.85 6.30 0.41 0.194 0.91 0.281 82.00% 1.99 1.47 21.90% 46.40% 31.60% 16.30% 6.30% 6.80% 22.20% 75.7 217.0 2.16 2.60 5.85 52.57 5.70 1.14 1.06 0.88 1.01 31 4.62 0.52 52.10% 96.0 23.9 1.43 37.90% 68.80% 51.70% 57.70% 79.70% 70.80% 44.60% 66.50% 15.10% 0 0 0 45 49 64 32.20% 5.50% 2.78 3.57 -0.29 36.80% 66.30% 51.70% 53.90% 79.90% 70.70% 50.70% 21.3 9.6 0.4 0.1 0.5 30 - 30 26.70% 42.30% 32.00% 25.60% 25.20% 48.20% 26.60% 2.60 38.90% 36.70% 66.10% 51.80% 52.40% 80.40% 70.80% 51.30% 21.9 -2.1 130 58 226 75 38 80 70 96 112 144 64 101 106 92 50 106 92 101 138 104 76 10954
2 2015 Jake Arrieta Cubs 22 6 0 33 33 229.0 9.28 1.89 0.39 0.246 80.00% 56.20% 7.80% 1.77 2.35 2.61 7.0 29 1.77 4 3 0 0 229.0 870 150 52 45 10 48 2 6 6 0 236 318 129 119 12 1202 2236 3438 118 22 14 3 9.28 1.89 4.92 5.90 0.39 0.184 0.86 0.246 80.00% 2.35 2.47 21.00% 56.20% 22.80% 9.30% 7.80% 6.90% 21.40% 60.0 229.0 2.77 2.61 5.93 56.13 6.04 0.92 0.92 0.87 0.83 29 5.78 0.67 50.70% 94.6 23.7 1.36 34.20% 63.00% 46.90% 61.30% 86.70% 76.30% 44.10% 60.20% 11.10% 0 0 0 45 61 68 27.10% 5.50% 2.75 4.64 -0.58 32.80% 62.30% 47.00% 59.00% 85.90% 76.10% 47.90% 22.7 9.1 2.9 -0.8 2.1 29 - 29 21.60% 34.30% 37.80% 27.90% 22.80% 55.20% 22.10% 2.90 33.80% 33.00% 61.80% 46.90% 59.50% 85.60% 76.20% 48.40% 23.3 2.2 118 64 184 67 40 73 67 82 110 131 71 100 121 70 68 90 106 106 123 106 75 4153
3 2014 Clayton Kershaw Dodgers 21 3 0 27 27 198.1 10.85 1.41 0.41 0.278 81.60% 51.80% 6.60% 1.77 1.81 2.08 7.9 26 1.77 6 2 0 0 198.1 749 139 42 39 9 31 0 2 7 2 239 243 137 89 19 835 1887 2722 124 12 8 1 10.85 1.41 7.71 6.31 0.41 0.194 0.86 0.278 81.60% 1.81 1.77 19.00% 51.80% 29.20% 13.90% 6.60% 4.90% 12.50% 62.5 198.1 1.84 2.08 5.04 43.95 4.92 0.99 0.95 0.86 0.92 21 4.97 0.14 55.40% 93.0 19.2 1.27 36.50% 68.60% 52.50% 55.60% 82.20% 72.90% 49.70% 68.80% 14.20% 0 0 0 51 49 56 31.90% 4.10% 2.09 5.63 -0.04 35.40% 67.90% 52.50% 50.30% 83.70% 73.00% 52.60% 23.0 8.2 0.8 -0.5 0.2 26 - 26 27.80% 43.80% 34.80% 21.40% 24.50% 51.20% 24.30% 1.94 37.30% 37.10% 67.00% 52.50% 53.60% 83.10% 73.00% 51.50% 23.7 -2.2 139 49 286 74 49 79 68 94 111 155 54 91 113 88 68 111 100 82 133 98 82 2036
4 2013 Clayton Kershaw Dodgers 16 9 0 33 33 236.0 8.85 1.98 0.42 0.251 80.60% 46.00% 5.80% 1.83 2.39 2.88 7.2 25 1.83 3 2 0 0 236.0 908 164 55 48 11 52 2 3 12 2 232 278 189 137 18 1142 2286 3428 108 17 15 3 8.85 1.98 4.46 6.25 0.42 0.192 0.92 0.251 80.60% 2.39 1.47 22.70% 46.00% 31.30% 9.50% 5.80% 6.10% 20.00% 59.8 236.0 2.87 2.88 4.90 47.70 5.36 0.99 0.95 0.88 1.11 30 5.45 -0.51 60.70% 92.6 37.8 1.82 32.50% 65.80% 48.40% 60.90% 84.70% 76.30% 47.60% 65.10% 11.40% 0 0 0 51 64 76 25.60% 5.70% 2.99 4.12 -0.56 33.10% 63.70% 48.50% 55.00% 87.00% 76.10% 50.20% 22.4 8.9 2.5 -0.8 1.7 25 - 25 19.80% 43.10% 35.40% 21.50% 14.40% 56.70% 28.90% 2.92 32.50% 33.90% 63.50% 48.50% 56.30% 87.30% 76.30% 49.20% 23.4 -0.6 118 67 177 73 47 78 71 86 109 130 73 106 101 95 57 114 97 84 86 106 97 2036
In [169]:
# All numeric variables
all_num = pw_cpy.select_dtypes(include=['number'])
In [170]:
#list(all_num.columns.values.tolist())
In [171]:
pw_num_vars = pw_cpy[['Name', 'Team', 'Season', 'WAR', 'Strikes' ,'HBP', 'GB', 'FB', 'WP', 'BK', 
                         'SO', 'IBB', 'LD', 'RS', 'IFH', 'BU', 'BUH', 'AVG', 'Starting']]

# Include only numeric columns except playerid
pw_num_vars = pw_cpy[['WAR'
 ,'W'
 ,'L'
 ,'SV'
 ,'G'
 ,'GS'
 ,'IP'
 ,'K/9'
 ,'BB/9'
 ,'HR/9'
 ,'BABIP'
 #,'ERA'
 ,'FIP'
 #,'xFIP'
 ,'Age'
 #,'ERA.1'
 ,'CG'
 ,'ShO'
 ,'SV.1'
 ,'BS'
 ,'IP.1'
 ,'TBF'
 ,'H'
 ,'R'
 ,'ER'
 ,'HR'
 ,'BB'
 ,'IBB'
 ,'HBP'
 ,'WP'
 ,'BK'
 ,'SO'
 ,'GB'
 ,'FB'
 ,'LD'
 ,'IFFB'
 ,'Balls'
 ,'Strikes'
 ,'Pitches'
 ,'RS'
 ,'IFH'
 ,'BU'
 ,'BUH'
 ##,'K/9.1'
 ##,'BB/9.1'
 ,'K/BB'
 ,'H/9'
 ##,'HR/9.1'
 ,'AVG'
 #,'WHIP'
 ##,'BABIP.1'
 ##,'FIP.1'
 ,'GB/FB'
 ,'Starting'
 ,'Start-IP'
 #,'tERA'
 ##,'xFIP.1'
 ,'WPA'
 #,'RE24'
 ,'REW'
 ,'pLI'
 ,'inLI'
 ,'gmLI'
 ,'exLI'
 ,'Pulls'
 ,'WPA/LI'
 ,'Clutch'
 ,'FBv'
 ,'wFB'
 ,'wFB/C'
 ,'HLD'
 ,'SD'
 ,'MD'
 ##,'ERA-'
 ,'FIP-'
 ##,'xFIP-'
 #,'SIERA'
 ,'RS/9'
 ,'E-F'
 ,'Pace'
 #,'RA9-WAR'
 ,'BIP-Wins'
 ,'LOB-Wins'
 ,'FDP-Wins'
 ,'kwERA'
 ,'Pace (pi)'
 ,'FRM'
 ##,'K/9+'
 ##,'BB/9+'
 ##,'K/BB+'
 ##,'H/9+'
 ##,'HR/9+'
 ##,'AVG+'
 ##,'WHIP+'
 ##,'BABIP+'
 ##,'LOB%+'
 ,'K%+'
 ,'BB%+'
 ,'LD%+'
 ,'GB%+'
 ,'FB%+'
 ,'HR/FB%+'
 ,'Pull%+'
 ,'Cent%+'
 ,'Oppo%+'
 ,'Soft%+'
 ,'Med%+'
 ,'Hard%+']]

pw_num_vars.head()
Out[171]:
WAR W L SV G GS IP K/9 BB/9 HR/9 BABIP FIP Age CG ShO SV.1 BS IP.1 TBF H R ER HR BB IBB HBP WP BK SO GB FB LD IFFB Balls Strikes Pitches RS IFH BU BUH K/BB H/9 AVG GB/FB Starting Start-IP WPA REW pLI inLI gmLI exLI Pulls WPA/LI Clutch FBv wFB wFB/C HLD SD MD FIP- RS/9 E-F Pace BIP-Wins LOB-Wins FDP-Wins kwERA Pace (pi) FRM K%+ BB%+ LD%+ GB%+ FB%+ HR/FB%+ Pull%+ Cent%+ Oppo%+ Soft%+ Med%+ Hard%+
0 5.3 19 3 0 32 32 222.2 8.08 1.62 0.57 0.229 2.76 31 1 0 0 0 222.2 843 148 43 41 14 40 1 5 7 0 200 281 193 112 18 1161 2078 3239 101 12 12 0 5.00 5.98 0.185 1.46 46.9 222.2 6.76 6.24 1.06 1.03 0.87 1.22 31 5.47 0.90 91.8 20.0 1.22 0 0 0 73 4.08 -1.10 22.9 4.0 0.1 4.1 3.21 23.4 3.9 115 61 91 104 101 64 102 101 96 117 99 91
1 9.0 10 9 0 32 32 217.0 11.16 1.91 0.41 0.281 1.99 30 1 0 0 0 217.0 835 152 48 41 10 46 3 5 2 0 269 235 160 111 26 999 2213 3212 86 16 9 2 5.85 6.30 0.194 1.47 75.7 217.0 5.85 5.70 1.14 1.06 0.88 1.01 31 4.62 0.52 96.0 23.9 1.43 0 0 0 49 3.57 -0.29 21.3 0.4 0.1 0.5 2.60 21.9 -2.1 144 64 101 106 92 50 106 92 101 138 104 76
2 7.0 22 6 0 33 33 229.0 9.28 1.89 0.39 0.246 2.35 29 4 3 0 0 229.0 870 150 52 45 10 48 2 6 6 0 236 318 129 119 12 1202 2236 3438 118 22 14 3 4.92 5.90 0.184 2.47 60.0 229.0 5.93 6.04 0.92 0.92 0.87 0.83 29 5.78 0.67 94.6 23.7 1.36 0 0 0 61 4.64 -0.58 22.7 2.9 -0.8 2.1 2.90 23.3 2.2 131 71 100 121 70 68 90 106 106 123 106 75
3 7.9 21 3 0 27 27 198.1 10.85 1.41 0.41 0.278 1.81 26 6 2 0 0 198.1 749 139 42 39 9 31 0 2 7 2 239 243 137 89 19 835 1887 2722 124 12 8 1 7.71 6.31 0.194 1.77 62.5 198.1 5.04 4.92 0.99 0.95 0.86 0.92 21 4.97 0.14 93.0 19.2 1.27 0 0 0 49 5.63 -0.04 23.0 0.8 -0.5 0.2 1.94 23.7 -2.2 155 54 91 113 88 68 111 100 82 133 98 82
4 7.2 16 9 0 33 33 236.0 8.85 1.98 0.42 0.251 2.39 25 3 2 0 0 236.0 908 164 55 48 11 52 2 3 12 2 232 278 189 137 18 1142 2286 3428 108 17 15 3 4.46 6.25 0.192 1.47 59.8 236.0 4.90 5.36 0.99 0.95 0.88 1.11 30 5.45 -0.51 92.6 37.8 1.82 0 0 0 64 4.12 -0.56 22.4 2.5 -0.8 1.7 2.92 23.4 -0.6 130 73 106 101 95 57 114 97 84 86 106 97
In [172]:
pw_num_vars.shape
Out[172]:
(776, 83)

Question 4: K-means Clustering

K=3 is the best fit. K=3 is the 'elbow' of the arm in the SSE plot and visually provides the clearest separation of points (over 5, 8, 10 tried below).

In [173]:
# Scale and transform
mms = MinMaxScaler()
mms.fit(pw_num_vars)
data_transformed = mms.transform(pw_num_vars)
In [174]:
data_transformed.shape
Out[174]:
(776, 83)
In [175]:
pw_kmeans = pw_cpy.copy()
pw_kmeans.shape
Out[175]:
(776, 158)
In [176]:
# Evaluate k-means clustering with SSE
# k means determine k
Sum_of_squared_distances = []
K = range(1,15)
for k in K:
    km = KMeans(n_clusters=k)
    km = km.fit(data_transformed[ : , 1:83])
    Sum_of_squared_distances.append(km.inertia_)
    
plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()
In [177]:
clusterer = KMeans(5, random_state=1)
In [178]:
clusterer = KMeans(8, random_state=1)
In [179]:
clusterer = KMeans(10, random_state=1)
In [180]:
# K=3 is the 'elbow' of the arm and provides the clearest separation of points
clusterer = KMeans(3, random_state=1)
In [181]:
# Fit clusterer
clusterer.fit(data_transformed[ : , 1:83])
Out[181]:
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=3, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=1, tol=0.0001, verbose=0)
In [182]:
data_transformed[ : , 1:83].shape
Out[182]:
(776, 82)
In [183]:
pw_kmeans.shape
Out[183]:
(776, 158)
In [184]:
# Predict values
pw_kmeans['clust_grp'] = clusterer.predict(data_transformed[ : , 1:100])
pw_kmeans.head()
Out[184]:
Season Name Team W L SV G GS IP K/9 BB/9 HR/9 BABIP LOB% GB% HR/FB ERA FIP xFIP WAR Age ERA.1 CG ShO SV.1 BS IP.1 TBF H R ER HR BB IBB HBP WP BK SO GB FB LD IFFB Balls Strikes Pitches RS IFH BU BUH K/9.1 BB/9.1 K/BB H/9 HR/9.1 AVG WHIP BABIP.1 LOB%.1 FIP.1 GB/FB LD% GB%.1 FB% IFFB% HR/FB.1 IFH% BUH% Starting Start-IP tERA xFIP.1 WPA RE24 REW pLI inLI gmLI exLI Pulls WPA/LI Clutch FB%.1 FBv wFB wFB/C O-Swing% Z-Swing% Swing% O-Contact% Z-Contact% Contact% Zone% F-Strike% SwStr% HLD SD MD ERA- FIP- xFIP- K% BB% SIERA RS/9 E-F O-Swing% (pfx) Z-Swing% (pfx) Swing% (pfx) O-Contact% (pfx) Z-Contact% (pfx) Contact% (pfx) Zone% (pfx) Pace RA9-WAR BIP-Wins LOB-Wins FDP-Wins Age Rng K-BB% Pull% Cent% Oppo% Soft% Med% Hard% kwERA TTO% O-Swing% (pi) Z-Swing% (pi) Swing% (pi) O-Contact% (pi) Z-Contact% (pi) Contact% (pi) Zone% (pi) Pace (pi) FRM K/9+ BB/9+ K/BB+ H/9+ HR/9+ AVG+ WHIP+ BABIP+ LOB%+ K%+ BB%+ LD%+ GB%+ FB%+ HR/FB%+ Pull%+ Cent%+ Oppo%+ Soft%+ Med%+ Hard%+ playerid clust_grp
0 2015 Zack Greinke Dodgers 19 3 0 32 32 222.2 8.08 1.62 0.57 0.229 86.50% 48.00% 7.30% 1.66 2.76 3.22 5.3 31 1.66 1 0 0 0 222.2 843 148 43 41 14 40 1 5 7 0 200 281 193 112 18 1161 2078 3239 101 12 12 0 8.08 1.62 5.00 5.98 0.57 0.185 0.84 0.229 86.50% 2.76 1.46 19.10% 48.00% 32.90% 9.30% 7.30% 4.30% 0.00% 46.9 222.2 2.81 3.22 6.76 56.24 6.24 1.06 1.03 0.87 1.22 31 5.47 0.90 50.70% 91.8 20.0 1.22 33.30% 69.10% 47.50% 60.60% 85.10% 74.80% 39.90% 64.10% 12.00% 0 0 0 44 73 83 23.70% 4.70% 3.27 4.08 -1.10 33.30% 64.90% 47.50% 57.80% 85.50% 74.80% 45.00% 22.9 9.4 4.0 0.1 4.1 31 - 31 19.00% 38.60% 36.10% 25.30% 21.70% 51.50% 26.80% 3.21 30.10% 33.60% 65.20% 47.50% 58.00% 85.90% 74.90% 44.10% 23.4 3.9 103 55 187 68 58 74 65 77 118 115 61 91 104 101 64 102 101 96 117 99 91 1943 0
1 2018 Jacob deGrom Mets 10 9 0 32 32 217.0 11.16 1.91 0.41 0.281 82.00% 46.40% 6.30% 1.70 1.99 2.60 9.0 30 1.70 1 0 0 0 217.0 835 152 48 41 10 46 3 5 2 0 269 235 160 111 26 999 2213 3212 86 16 9 2 11.16 1.91 5.85 6.30 0.41 0.194 0.91 0.281 82.00% 1.99 1.47 21.90% 46.40% 31.60% 16.30% 6.30% 6.80% 22.20% 75.7 217.0 2.16 2.60 5.85 52.57 5.70 1.14 1.06 0.88 1.01 31 4.62 0.52 52.10% 96.0 23.9 1.43 37.90% 68.80% 51.70% 57.70% 79.70% 70.80% 44.60% 66.50% 15.10% 0 0 0 45 49 64 32.20% 5.50% 2.78 3.57 -0.29 36.80% 66.30% 51.70% 53.90% 79.90% 70.70% 50.70% 21.3 9.6 0.4 0.1 0.5 30 - 30 26.70% 42.30% 32.00% 25.60% 25.20% 48.20% 26.60% 2.60 38.90% 36.70% 66.10% 51.80% 52.40% 80.40% 70.80% 51.30% 21.9 -2.1 130 58 226 75 38 80 70 96 112 144 64 101 106 92 50 106 92 101 138 104 76 10954 0
2 2015 Jake Arrieta Cubs 22 6 0 33 33 229.0 9.28 1.89 0.39 0.246 80.00% 56.20% 7.80% 1.77 2.35 2.61 7.0 29 1.77 4 3 0 0 229.0 870 150 52 45 10 48 2 6 6 0 236 318 129 119 12 1202 2236 3438 118 22 14 3 9.28 1.89 4.92 5.90 0.39 0.184 0.86 0.246 80.00% 2.35 2.47 21.00% 56.20% 22.80% 9.30% 7.80% 6.90% 21.40% 60.0 229.0 2.77 2.61 5.93 56.13 6.04 0.92 0.92 0.87 0.83 29 5.78 0.67 50.70% 94.6 23.7 1.36 34.20% 63.00% 46.90% 61.30% 86.70% 76.30% 44.10% 60.20% 11.10% 0 0 0 45 61 68 27.10% 5.50% 2.75 4.64 -0.58 32.80% 62.30% 47.00% 59.00% 85.90% 76.10% 47.90% 22.7 9.1 2.9 -0.8 2.1 29 - 29 21.60% 34.30% 37.80% 27.90% 22.80% 55.20% 22.10% 2.90 33.80% 33.00% 61.80% 46.90% 59.50% 85.60% 76.20% 48.40% 23.3 2.2 118 64 184 67 40 73 67 82 110 131 71 100 121 70 68 90 106 106 123 106 75 4153 0
3 2014 Clayton Kershaw Dodgers 21 3 0 27 27 198.1 10.85 1.41 0.41 0.278 81.60% 51.80% 6.60% 1.77 1.81 2.08 7.9 26 1.77 6 2 0 0 198.1 749 139 42 39 9 31 0 2 7 2 239 243 137 89 19 835 1887 2722 124 12 8 1 10.85 1.41 7.71 6.31 0.41 0.194 0.86 0.278 81.60% 1.81 1.77 19.00% 51.80% 29.20% 13.90% 6.60% 4.90% 12.50% 62.5 198.1 1.84 2.08 5.04 43.95 4.92 0.99 0.95 0.86 0.92 21 4.97 0.14 55.40% 93.0 19.2 1.27 36.50% 68.60% 52.50% 55.60% 82.20% 72.90% 49.70% 68.80% 14.20% 0 0 0 51 49 56 31.90% 4.10% 2.09 5.63 -0.04 35.40% 67.90% 52.50% 50.30% 83.70% 73.00% 52.60% 23.0 8.2 0.8 -0.5 0.2 26 - 26 27.80% 43.80% 34.80% 21.40% 24.50% 51.20% 24.30% 1.94 37.30% 37.10% 67.00% 52.50% 53.60% 83.10% 73.00% 51.50% 23.7 -2.2 139 49 286 74 49 79 68 94 111 155 54 91 113 88 68 111 100 82 133 98 82 2036 0
4 2013 Clayton Kershaw Dodgers 16 9 0 33 33 236.0 8.85 1.98 0.42 0.251 80.60% 46.00% 5.80% 1.83 2.39 2.88 7.2 25 1.83 3 2 0 0 236.0 908 164 55 48 11 52 2 3 12 2 232 278 189 137 18 1142 2286 3428 108 17 15 3 8.85 1.98 4.46 6.25 0.42 0.192 0.92 0.251 80.60% 2.39 1.47 22.70% 46.00% 31.30% 9.50% 5.80% 6.10% 20.00% 59.8 236.0 2.87 2.88 4.90 47.70 5.36 0.99 0.95 0.88 1.11 30 5.45 -0.51 60.70% 92.6 37.8 1.82 32.50% 65.80% 48.40% 60.90% 84.70% 76.30% 47.60% 65.10% 11.40% 0 0 0 51 64 76 25.60% 5.70% 2.99 4.12 -0.56 33.10% 63.70% 48.50% 55.00% 87.00% 76.10% 50.20% 22.4 8.9 2.5 -0.8 1.7 25 - 25 19.80% 43.10% 35.40% 21.50% 14.40% 56.70% 28.90% 2.92 32.50% 33.90% 63.50% 48.50% 56.30% 87.30% 76.30% 49.20% 23.4 -0.6 118 67 177 73 47 78 71 86 109 130 73 106 101 95 57 114 97 84 86 106 97 2036 0
In [185]:
pw_kmeans = pw_kmeans[(pw_kmeans['WAR'] >= 0)]
In [186]:
sns.lmplot(data = pw_kmeans, x = 'Strikes', y = 'WAR', hue = 'clust_grp', fit_reg = False)
Out[186]:
<seaborn.axisgrid.FacetGrid at 0x1a3c8bdd90>
In [187]:
# Recreate this scatterplot with plotly
fig = px.scatter(pw_kmeans, x="Strikes", y="WAR", color="clust_grp"
                 ,size= 'HBP'
                 ,hover_data=['Name'])
fig.show()

Question 5: t_SNE and UMAP

t-SNE

In [188]:
# Use numeric varibles except for playerid and season
tsne_embedded = TSNE(n_components = 2
                 ,perplexity = 5
                 ,learning_rate = 500
                 ,n_iter = 1000
                 ,n_iter_without_progress = 200
 ).fit_transform(data_transformed)
In [189]:
tsne_embedded.shape
Out[189]:
(776, 2)
In [190]:
tsne_embedded_df = pd.DataFrame(tsne_embedded)
In [191]:
tsne_embedded_df = tsne_embedded_df.add_prefix('tsne')
tsne_embedded_df.head()
Out[191]:
tsne0 tsne1
0 -39.383324 -14.520823
1 -40.591110 -17.136038
2 -66.401459 -16.493061
3 -67.486702 -30.304983
4 -71.386848 -13.094769
In [192]:
pw_tsne = pd.concat([pw_cpy, tsne_embedded_df], axis = 1)
In [193]:
pw_tsne.head()
Out[193]:
Season Name Team W L SV G GS IP K/9 BB/9 HR/9 BABIP LOB% GB% HR/FB ERA FIP xFIP WAR Age ERA.1 CG ShO SV.1 BS IP.1 TBF H R ER HR BB IBB HBP WP BK SO GB FB LD IFFB Balls Strikes Pitches RS IFH BU BUH K/9.1 BB/9.1 K/BB H/9 HR/9.1 AVG WHIP BABIP.1 LOB%.1 FIP.1 GB/FB LD% GB%.1 FB% IFFB% HR/FB.1 IFH% BUH% Starting Start-IP tERA xFIP.1 WPA RE24 REW pLI inLI gmLI exLI Pulls WPA/LI Clutch FB%.1 FBv wFB wFB/C O-Swing% Z-Swing% Swing% O-Contact% Z-Contact% Contact% Zone% F-Strike% SwStr% HLD SD MD ERA- FIP- xFIP- K% BB% SIERA RS/9 E-F O-Swing% (pfx) Z-Swing% (pfx) Swing% (pfx) O-Contact% (pfx) Z-Contact% (pfx) Contact% (pfx) Zone% (pfx) Pace RA9-WAR BIP-Wins LOB-Wins FDP-Wins Age Rng K-BB% Pull% Cent% Oppo% Soft% Med% Hard% kwERA TTO% O-Swing% (pi) Z-Swing% (pi) Swing% (pi) O-Contact% (pi) Z-Contact% (pi) Contact% (pi) Zone% (pi) Pace (pi) FRM K/9+ BB/9+ K/BB+ H/9+ HR/9+ AVG+ WHIP+ BABIP+ LOB%+ K%+ BB%+ LD%+ GB%+ FB%+ HR/FB%+ Pull%+ Cent%+ Oppo%+ Soft%+ Med%+ Hard%+ playerid tsne0 tsne1
0 2015 Zack Greinke Dodgers 19 3 0 32 32 222.2 8.08 1.62 0.57 0.229 86.50% 48.00% 7.30% 1.66 2.76 3.22 5.3 31 1.66 1 0 0 0 222.2 843 148 43 41 14 40 1 5 7 0 200 281 193 112 18 1161 2078 3239 101 12 12 0 8.08 1.62 5.00 5.98 0.57 0.185 0.84 0.229 86.50% 2.76 1.46 19.10% 48.00% 32.90% 9.30% 7.30% 4.30% 0.00% 46.9 222.2 2.81 3.22 6.76 56.24 6.24 1.06 1.03 0.87 1.22 31 5.47 0.90 50.70% 91.8 20.0 1.22 33.30% 69.10% 47.50% 60.60% 85.10% 74.80% 39.90% 64.10% 12.00% 0 0 0 44 73 83 23.70% 4.70% 3.27 4.08 -1.10 33.30% 64.90% 47.50% 57.80% 85.50% 74.80% 45.00% 22.9 9.4 4.0 0.1 4.1 31 - 31 19.00% 38.60% 36.10% 25.30% 21.70% 51.50% 26.80% 3.21 30.10% 33.60% 65.20% 47.50% 58.00% 85.90% 74.90% 44.10% 23.4 3.9 103 55 187 68 58 74 65 77 118 115 61 91 104 101 64 102 101 96 117 99 91 1943 -39.383324 -14.520823
1 2018 Jacob deGrom Mets 10 9 0 32 32 217.0 11.16 1.91 0.41 0.281 82.00% 46.40% 6.30% 1.70 1.99 2.60 9.0 30 1.70 1 0 0 0 217.0 835 152 48 41 10 46 3 5 2 0 269 235 160 111 26 999 2213 3212 86 16 9 2 11.16 1.91 5.85 6.30 0.41 0.194 0.91 0.281 82.00% 1.99 1.47 21.90% 46.40% 31.60% 16.30% 6.30% 6.80% 22.20% 75.7 217.0 2.16 2.60 5.85 52.57 5.70 1.14 1.06 0.88 1.01 31 4.62 0.52 52.10% 96.0 23.9 1.43 37.90% 68.80% 51.70% 57.70% 79.70% 70.80% 44.60% 66.50% 15.10% 0 0 0 45 49 64 32.20% 5.50% 2.78 3.57 -0.29 36.80% 66.30% 51.70% 53.90% 79.90% 70.70% 50.70% 21.3 9.6 0.4 0.1 0.5 30 - 30 26.70% 42.30% 32.00% 25.60% 25.20% 48.20% 26.60% 2.60 38.90% 36.70% 66.10% 51.80% 52.40% 80.40% 70.80% 51.30% 21.9 -2.1 130 58 226 75 38 80 70 96 112 144 64 101 106 92 50 106 92 101 138 104 76 10954 -40.591110 -17.136038
2 2015 Jake Arrieta Cubs 22 6 0 33 33 229.0 9.28 1.89 0.39 0.246 80.00% 56.20% 7.80% 1.77 2.35 2.61 7.0 29 1.77 4 3 0 0 229.0 870 150 52 45 10 48 2 6 6 0 236 318 129 119 12 1202 2236 3438 118 22 14 3 9.28 1.89 4.92 5.90 0.39 0.184 0.86 0.246 80.00% 2.35 2.47 21.00% 56.20% 22.80% 9.30% 7.80% 6.90% 21.40% 60.0 229.0 2.77 2.61 5.93 56.13 6.04 0.92 0.92 0.87 0.83 29 5.78 0.67 50.70% 94.6 23.7 1.36 34.20% 63.00% 46.90% 61.30% 86.70% 76.30% 44.10% 60.20% 11.10% 0 0 0 45 61 68 27.10% 5.50% 2.75 4.64 -0.58 32.80% 62.30% 47.00% 59.00% 85.90% 76.10% 47.90% 22.7 9.1 2.9 -0.8 2.1 29 - 29 21.60% 34.30% 37.80% 27.90% 22.80% 55.20% 22.10% 2.90 33.80% 33.00% 61.80% 46.90% 59.50% 85.60% 76.20% 48.40% 23.3 2.2 118 64 184 67 40 73 67 82 110 131 71 100 121 70 68 90 106 106 123 106 75 4153 -66.401459 -16.493061
3 2014 Clayton Kershaw Dodgers 21 3 0 27 27 198.1 10.85 1.41 0.41 0.278 81.60% 51.80% 6.60% 1.77 1.81 2.08 7.9 26 1.77 6 2 0 0 198.1 749 139 42 39 9 31 0 2 7 2 239 243 137 89 19 835 1887 2722 124 12 8 1 10.85 1.41 7.71 6.31 0.41 0.194 0.86 0.278 81.60% 1.81 1.77 19.00% 51.80% 29.20% 13.90% 6.60% 4.90% 12.50% 62.5 198.1 1.84 2.08 5.04 43.95 4.92 0.99 0.95 0.86 0.92 21 4.97 0.14 55.40% 93.0 19.2 1.27 36.50% 68.60% 52.50% 55.60% 82.20% 72.90% 49.70% 68.80% 14.20% 0 0 0 51 49 56 31.90% 4.10% 2.09 5.63 -0.04 35.40% 67.90% 52.50% 50.30% 83.70% 73.00% 52.60% 23.0 8.2 0.8 -0.5 0.2 26 - 26 27.80% 43.80% 34.80% 21.40% 24.50% 51.20% 24.30% 1.94 37.30% 37.10% 67.00% 52.50% 53.60% 83.10% 73.00% 51.50% 23.7 -2.2 139 49 286 74 49 79 68 94 111 155 54 91 113 88 68 111 100 82 133 98 82 2036 -67.486702 -30.304983
4 2013 Clayton Kershaw Dodgers 16 9 0 33 33 236.0 8.85 1.98 0.42 0.251 80.60% 46.00% 5.80% 1.83 2.39 2.88 7.2 25 1.83 3 2 0 0 236.0 908 164 55 48 11 52 2 3 12 2 232 278 189 137 18 1142 2286 3428 108 17 15 3 8.85 1.98 4.46 6.25 0.42 0.192 0.92 0.251 80.60% 2.39 1.47 22.70% 46.00% 31.30% 9.50% 5.80% 6.10% 20.00% 59.8 236.0 2.87 2.88 4.90 47.70 5.36 0.99 0.95 0.88 1.11 30 5.45 -0.51 60.70% 92.6 37.8 1.82 32.50% 65.80% 48.40% 60.90% 84.70% 76.30% 47.60% 65.10% 11.40% 0 0 0 51 64 76 25.60% 5.70% 2.99 4.12 -0.56 33.10% 63.70% 48.50% 55.00% 87.00% 76.10% 50.20% 22.4 8.9 2.5 -0.8 1.7 25 - 25 19.80% 43.10% 35.40% 21.50% 14.40% 56.70% 28.90% 2.92 32.50% 33.90% 63.50% 48.50% 56.30% 87.30% 76.30% 49.20% 23.4 -0.6 118 67 177 73 47 78 71 86 109 130 73 106 101 95 57 114 97 84 86 106 97 2036 -71.386848 -13.094769

The t-SNE method did a fairly good job of separating pitchers with high WAR scores from those with lower scores. Most pitchers with high WAR scores, greater than 4, appear in the upper left side of the graph where the tsne0 value is less than -20 and the tsne1 values are average to high. Those pitchers with low scores, less than 3, appear in the upper and lower right side of the graph where the tsne0 value is greater than -20. For lower scoring pitchers, the tsne1 value does not seem to have an effect. Pitchers with scores between 3 and 4 appear scattered throughout the graph so these were not classified as well.

In [194]:
# Create scatterplot with plotly
fig = px.scatter(pw_tsne, x="tsne0", y="tsne1", color="WAR"
                 #,size= 'HBP'
                 ,hover_data=['Name'])
fig.show()

UMAP unsupervised

In [195]:
reducer = umap.UMAP(n_neighbors = 5
                    ,min_dist = 0.0
                    ,n_components = 15
                    ,metric = 'euclidean'
                   )
In [196]:
umap_embedding = reducer.fit_transform(data_transformed)
In [197]:
umap_embedding.shape
Out[197]:
(776, 15)
In [198]:
umap_embedding_df = pd.DataFrame(umap_embedding)
In [199]:
umap_embedding_df = umap_embedding_df.add_prefix('umap')
umap_embedding_df.head()
Out[199]:
umap0 umap1 umap2 umap3 umap4 umap5 umap6 umap7 umap8 umap9 umap10 umap11 umap12 umap13 umap14
0 -0.959095 0.742517 -0.342717 0.283599 -0.041698 -0.597406 0.367899 0.156559 -0.135494 -0.004563 -0.106252 0.180898 -0.255766 0.145360 -0.314972
1 -1.278238 0.483768 -0.452571 0.455866 0.172076 -0.726319 0.542365 0.031357 -0.161453 0.025053 0.057288 0.275988 -0.361377 0.165619 -0.112474
2 -2.622020 0.862631 0.061042 0.064830 0.786090 -0.678355 0.100389 -0.021219 -0.114247 0.023965 0.443649 0.313649 -0.080830 -0.398070 -0.473914
3 -1.185790 -0.804008 -1.090984 1.226324 0.456543 -1.108320 1.342379 -0.303051 -0.269800 0.148545 0.379089 0.517045 -0.810955 0.566231 0.778592
4 -2.671853 0.833226 0.106715 0.022607 0.836682 -0.738114 0.110776 -0.024334 -0.124127 0.004256 0.435345 0.270310 -0.044712 -0.415800 -0.431966
In [200]:
pw_umap = pd.concat([pw_cpy, umap_embedding_df], axis = 1)
In [201]:
%matplotlib inline
sns.set(style='white', rc={'figure.figsize':(25,25)})
In [202]:
#plt.scatter(umap_embedding[:,0], umap_embedding[:,1], c = data_transformed[ : , 0])
In [203]:
# Create scatterplot with plotly
fig = px.scatter(pw_umap, x="umap0", y="umap1", color="WAR"
                 #,size= 'HBP'
                 ,hover_data=['Name'])
fig.show()

UMAP supervised

In [204]:
reducer = umap.UMAP(n_neighbors = 5
                    ,min_dist = 0.0
                    ,n_components = 15
                    ,metric = 'euclidean'
                   )
In [205]:
data_transformed.shape
data_transformed.dtype
Out[205]:
dtype('float64')
In [206]:
# let's perform UMAP with a target
umap_embedding = umap.UMAP().fit_transform(data_transformed[ : , 1:100], y=data_transformed[ : , 0])
In [207]:
umap_embedding.shape
umap_embedding
Out[207]:
array([[ 3.8477626 , -3.0591156 ],
       [ 4.003337  , -3.2282584 ],
       [ 3.8109627 , -3.3949153 ],
       ...,
       [-1.7249231 , -0.6555294 ],
       [-1.3848749 , -0.28857303],
       [-1.703901  , -0.30786255]], dtype=float32)
In [208]:
umap_embedding_df = pd.DataFrame(umap_embedding)
In [209]:
umap_embedding_df = umap_embedding_df.add_prefix('umap')
umap_embedding_df.head()
Out[209]:
umap0 umap1
0 3.847763 -3.059116
1 4.003337 -3.228258
2 3.810963 -3.394915
3 4.301785 0.099093
4 3.878429 -3.637072
In [210]:
pw_umap = pd.concat([pw_cpy, umap_embedding_df], axis = 1)
In [211]:
%matplotlib inline
sns.set(style='white', rc={'figure.figsize':(25,25)})
In [212]:
#plt.scatter(umap_embedding[:,0], umap_embedding[:,1], c = data_transformed[ : , 0])

The supervised UMAP method seems to mirror the unsupervised umap and t-SNE methods with high WAR scores in the upper and lower right portion of the graph. These high scores are associated with a high umap0 value and a low or average umap1 value. Pitchers with scores below 3 appear toward the left where the umap0 value is above average but below the umap0 value for high WAR scores. Most pitchers with WAR scores between 3 and 4 appear together in between the low and the high, although it is difficult to see. Hovering over the dots helps to show this. The second graph shows the distinction between the high and the low a little better than the first graph. You can see a band of light yellow and orange dots between the darker red/orange and green/blue/purple dots.

In [213]:
# Create scatterplot with plotly
fig = px.scatter(pw_umap, x="umap0", y="umap1", color="WAR"
                 #,size= 'HBP'
                 ,hover_data=['Name'])
fig.show()
In [214]:
classes = [
    '1',
    '2',
    '3',
    '4',
    '5',
    '6',
    '7',
    '8',
    '9',
    '10']
In [215]:
fig, ax = plt.subplots(1, figsize=(14, 10))
plt.scatter(*umap_embedding.T, s=30.0, c=pw_umap['WAR'], cmap='Spectral', alpha=1.0)
plt.setp(ax, xticks=[], yticks=[])
cbar = plt.colorbar(boundaries=np.arange(11)-0.5)
cbar.set_ticks(np.arange(10))
cbar.set_ticklabels(classes)
plt.title('Fashion MNIST Embedded via UMAP using Labels');
In [ ]: